From 27536fcba381a2153ac95ec8cec0599d2586b827 Mon Sep 17 00:00:00 2001 From: Liam Connors Date: Tue, 17 Sep 2024 02:30:48 -0400 Subject: [PATCH 001/145] test: add tests for case sensitive regex in `str.contains` (#986) * add test for case sensitive contains * fix typo in test --- tests/expr_and_series/str/contains_test.py | 28 ++++++++++++++++++++-- 1 file changed, 26 insertions(+), 2 deletions(-) diff --git a/tests/expr_and_series/str/contains_test.py b/tests/expr_and_series/str/contains_test.py index 312de50a4..6b9e74b69 100644 --- a/tests/expr_and_series/str/contains_test.py +++ b/tests/expr_and_series/str/contains_test.py @@ -14,7 +14,9 @@ df_polars = pl.DataFrame(data) -def test_contains(constructor: Constructor, request: pytest.FixtureRequest) -> None: +def test_contains_case_insensitive( + constructor: Constructor, request: pytest.FixtureRequest +) -> None: if "cudf" in str(constructor): request.applymarker(pytest.mark.xfail) @@ -29,7 +31,9 @@ def test_contains(constructor: Constructor, request: pytest.FixtureRequest) -> N compare_dicts(result, expected) -def test_contains_series(constructor_eager: Any, request: pytest.FixtureRequest) -> None: +def test_contains_series_case_insensitive( + constructor_eager: Any, request: pytest.FixtureRequest +) -> None: if "cudf" in str(constructor_eager): request.applymarker(pytest.mark.xfail) @@ -42,3 +46,23 @@ def test_contains_series(constructor_eager: Any, request: pytest.FixtureRequest) "case_insensitive_match": [False, False, True, True], } compare_dicts(result, expected) + + +def test_contains_case_sensitive(constructor: Constructor) -> None: + df = nw.from_native(constructor(data)) + result = df.with_columns(nw.col("pets").str.contains("parrot|Dove").alias("result")) + expected = { + "pets": ["cat", "dog", "rabbit and parrot", "dove"], + "result": [False, False, True, False], + } + compare_dicts(result, expected) + + +def test_contains_series_case_sensitive(constructor_eager: Any) -> None: + df = nw.from_native(constructor_eager(data), eager_only=True) + result = df.with_columns(case_sensitive_match=df["pets"].str.contains("parrot|Dove")) + expected = { + "pets": ["cat", "dog", "rabbit and parrot", "dove"], + "case_sensitive_match": [False, False, True, False], + } + compare_dicts(result, expected) From 733c5e957ef927b37118d63f88afc721a5950d9a Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Tue, 17 Sep 2024 14:03:35 +0200 Subject: [PATCH 002/145] [pre-commit.ci] pre-commit autoupdate (#984) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit updates: - [github.com/astral-sh/ruff-pre-commit: v0.6.4 → v0.6.5](https://github.com/astral-sh/ruff-pre-commit/compare/v0.6.4...v0.6.5) Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com> --- .pre-commit-config.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index 04e41ea30..e97fce29d 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -1,7 +1,7 @@ repos: - repo: https://github.com/astral-sh/ruff-pre-commit # Ruff version. - rev: 'v0.6.4' + rev: 'v0.6.5' hooks: # Run the formatter. 
- id: ruff-format From 9353ef9f681f032feee777fabbb8db8ae98aa79d Mon Sep 17 00:00:00 2001 From: Liam Connors Date: Wed, 18 Sep 2024 02:44:04 -0400 Subject: [PATCH 003/145] test: convert cuDF numpy types to python types in tests (#987) * convert cuDF scalars to python types * add pragma: no cover to tests --- tests/utils.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/tests/utils.py b/tests/utils.py index b13bec192..e8b453f9d 100644 --- a/tests/utils.py +++ b/tests/utils.py @@ -39,6 +39,10 @@ def compare_dicts(result: Any, expected: dict[str, Any]) -> None: lhs = lhs.as_py() # noqa: PLW2901 if hasattr(rhs, "as_py"): # pragma: no cover rhs = rhs.as_py() # noqa: PLW2901 + if hasattr(lhs, "item"): # pragma: no cover + lhs = lhs.item() # noqa: PLW2901 + if hasattr(rhs, "item"): # pragma: no cover + rhs = rhs.item() # noqa: PLW2901 if isinstance(lhs, float) and not math.isnan(lhs): assert math.isclose(lhs, rhs, rel_tol=0, abs_tol=1e-6), (lhs, rhs) elif isinstance(lhs, float) and math.isnan(lhs): From d5d27a9fd4bcd92728a69fbbfaa3a1fbe7b405fe Mon Sep 17 00:00:00 2001 From: Atul Chhotray Date: Thu, 19 Sep 2024 20:56:57 +0200 Subject: [PATCH 004/145] Update CONTRIBUTING.md (#996) Fixed link to 'how Narwhals works' --- CONTRIBUTING.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md index aeed2538f..f2ed79c62 100644 --- a/CONTRIBUTING.md +++ b/CONTRIBUTING.md @@ -153,7 +153,7 @@ listed above in [Working with local development environment](#working-with-local ## How it works If Narwhals looks like underwater unicorn magic to you, then please read -[how it works](https://narwhals-dev.github.io/narwhals/how-it-works/). +[how it works](https://narwhals-dev.github.io/narwhals/how_it_works/). ## Imports From a52d4702f495c48a71657011794c4c8dc79c8c4e Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Fri, 20 Sep 2024 10:45:45 +0200 Subject: [PATCH 005/145] skip changelog(deps): bump astral-sh/setup-uv from 2 to 3 (#985) Bumps [astral-sh/setup-uv](https://github.com/astral-sh/setup-uv) from 2 to 3. - [Release notes](https://github.com/astral-sh/setup-uv/releases) - [Commits](https://github.com/astral-sh/setup-uv/compare/v2...v3) --- updated-dependencies: - dependency-name: astral-sh/setup-uv dependency-type: direct:production update-type: version-update:semver-major ... 
Signed-off-by: dependabot[bot] Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com> --- .github/workflows/check_docs_build.yml | 2 +- .github/workflows/check_tpch_queries.yml | 2 +- .github/workflows/downstream_tests.yml | 4 ++-- .github/workflows/extremes.yml | 8 ++++---- .github/workflows/pytest.yml | 6 +++--- .github/workflows/random_ci_pytest.yml | 2 +- 6 files changed, 12 insertions(+), 12 deletions(-) diff --git a/.github/workflows/check_docs_build.yml b/.github/workflows/check_docs_build.yml index c59e67f46..1602bff42 100644 --- a/.github/workflows/check_docs_build.yml +++ b/.github/workflows/check_docs_build.yml @@ -19,7 +19,7 @@ jobs: with: python-version: ${{ matrix.python-version }} - name: Install uv - uses: astral-sh/setup-uv@v2 + uses: astral-sh/setup-uv@v3 with: enable-cache: "true" cache-suffix: ${{ matrix.python-version }} diff --git a/.github/workflows/check_tpch_queries.yml b/.github/workflows/check_tpch_queries.yml index 509ebb95b..f1bbde928 100644 --- a/.github/workflows/check_tpch_queries.yml +++ b/.github/workflows/check_tpch_queries.yml @@ -19,7 +19,7 @@ jobs: with: python-version: ${{ matrix.python-version }} - name: Install uv - uses: astral-sh/setup-uv@v2 + uses: astral-sh/setup-uv@v3 with: enable-cache: "true" cache-suffix: ${{ matrix.python-version }} diff --git a/.github/workflows/downstream_tests.yml b/.github/workflows/downstream_tests.yml index 26e9edf7a..0dcb78209 100644 --- a/.github/workflows/downstream_tests.yml +++ b/.github/workflows/downstream_tests.yml @@ -19,7 +19,7 @@ jobs: with: python-version: ${{ matrix.python-version }} - name: Install uv - uses: astral-sh/setup-uv@v2 + uses: astral-sh/setup-uv@v3 with: enable-cache: "true" cache-suffix: ${{ matrix.python-version }} @@ -63,7 +63,7 @@ jobs: with: python-version: ${{ matrix.python-version }} - name: Install uv - uses: astral-sh/setup-uv@v2 + uses: astral-sh/setup-uv@v3 with: enable-cache: "true" cache-suffix: ${{ matrix.python-version }} diff --git a/.github/workflows/extremes.yml b/.github/workflows/extremes.yml index cec9d32d7..f11a4f4bb 100644 --- a/.github/workflows/extremes.yml +++ b/.github/workflows/extremes.yml @@ -19,7 +19,7 @@ jobs: with: python-version: ${{ matrix.python-version }} - name: Install uv - uses: astral-sh/setup-uv@v2 + uses: astral-sh/setup-uv@v3 with: enable-cache: "true" cache-suffix: ${{ matrix.python-version }} @@ -46,7 +46,7 @@ jobs: with: python-version: ${{ matrix.python-version }} - name: Install uv - uses: astral-sh/setup-uv@v2 + uses: astral-sh/setup-uv@v3 with: enable-cache: "true" cache-suffix: ${{ matrix.python-version }} @@ -75,7 +75,7 @@ jobs: with: python-version: ${{ matrix.python-version }} - name: Install uv - uses: astral-sh/setup-uv@v2 + uses: astral-sh/setup-uv@v3 with: enable-cache: "true" cache-suffix: ${{ matrix.python-version }} @@ -104,7 +104,7 @@ jobs: with: python-version: ${{ matrix.python-version }} - name: Install uv - uses: astral-sh/setup-uv@v2 + uses: astral-sh/setup-uv@v3 with: enable-cache: "true" cache-suffix: ${{ matrix.python-version }} diff --git a/.github/workflows/pytest.yml b/.github/workflows/pytest.yml index 39b5c91b3..179891ddf 100644 --- a/.github/workflows/pytest.yml +++ b/.github/workflows/pytest.yml @@ -19,7 +19,7 @@ jobs: with: python-version: ${{ matrix.python-version }} - name: Install uv - uses: astral-sh/setup-uv@v2 + uses: astral-sh/setup-uv@v3 with: enable-cache: "true" cache-suffix: ${{ matrix.python-version }} @@ -46,7 +46,7 @@ jobs: with: python-version: ${{ matrix.python-version 
}} - name: Install uv - uses: astral-sh/setup-uv@v2 + uses: astral-sh/setup-uv@v3 with: enable-cache: "true" cache-suffix: ${{ matrix.python-version }} @@ -75,7 +75,7 @@ jobs: with: python-version: ${{ matrix.python-version }} - name: Install uv - uses: astral-sh/setup-uv@v2 + uses: astral-sh/setup-uv@v3 with: enable-cache: "true" cache-suffix: ${{ matrix.python-version }} diff --git a/.github/workflows/random_ci_pytest.yml b/.github/workflows/random_ci_pytest.yml index b029aba3c..addc73ed3 100644 --- a/.github/workflows/random_ci_pytest.yml +++ b/.github/workflows/random_ci_pytest.yml @@ -17,7 +17,7 @@ jobs: with: python-version: ${{ matrix.python-version }} - name: Install uv - uses: astral-sh/setup-uv@v2 + uses: astral-sh/setup-uv@v3 with: enable-cache: "true" cache-suffix: ${{ matrix.python-version }} From f0b31ee7ce3569985e3ebed6c116567eda53fd6d Mon Sep 17 00:00:00 2001 From: raisadz <34237447+raisadz@users.noreply.github.com> Date: Fri, 20 Sep 2024 10:13:06 +0100 Subject: [PATCH 006/145] Bug: `__getitem__` check for edge cases such as subsetting 0 rows or 0 columns (#994) --- narwhals/_arrow/dataframe.py | 60 +++++++++++++------------- narwhals/_arrow/utils.py | 22 ++++++++++ narwhals/_pandas_like/dataframe.py | 43 +++++++++++-------- narwhals/_pandas_like/utils.py | 10 +++++ narwhals/_polars/dataframe.py | 68 ++++++++++++++++++------------ narwhals/_polars/utils.py | 9 ++++ narwhals/dataframe.py | 19 +++++++-- narwhals/stable/v1.py | 3 ++ narwhals/utils.py | 5 +++ tests/frame/slice_test.py | 33 ++++++++++++++- tests/series_only/slice_test.py | 6 +++ 11 files changed, 201 insertions(+), 77 deletions(-) diff --git a/narwhals/_arrow/dataframe.py b/narwhals/_arrow/dataframe.py index 428e83e3b..5b74ddb73 100644 --- a/narwhals/_arrow/dataframe.py +++ b/narwhals/_arrow/dataframe.py @@ -9,7 +9,8 @@ from typing import overload from narwhals._arrow.utils import broadcast_series -from narwhals._arrow.utils import convert_slice_to_nparray +from narwhals._arrow.utils import convert_str_slice_to_int_slice +from narwhals._arrow.utils import select_rows from narwhals._arrow.utils import translate_dtype from narwhals._arrow.utils import validate_dataframe_comparand from narwhals._expression_parsing import evaluate_into_exprs @@ -18,6 +19,7 @@ from narwhals.utils import Implementation from narwhals.utils import flatten from narwhals.utils import generate_unique_token +from narwhals.utils import is_sequence_but_not_str from narwhals.utils import parse_columns_to_drop if TYPE_CHECKING: @@ -121,6 +123,9 @@ def __getitem__(self, item: str) -> ArrowSeries: ... @overload def __getitem__(self, item: slice) -> ArrowDataFrame: ... + @overload + def __getitem__(self, item: tuple[slice, slice]) -> ArrowDataFrame: ... 
+ def __getitem__( self, item: str @@ -128,7 +133,8 @@ def __getitem__( | Sequence[int] | Sequence[str] | tuple[Sequence[int], str | int] - | tuple[slice, str | int], + | tuple[slice, str | int] + | tuple[slice, slice], ) -> ArrowSeries | ArrowDataFrame: if isinstance(item, str): from narwhals._arrow.series import ArrowSeries @@ -141,33 +147,19 @@ def __getitem__( elif ( isinstance(item, tuple) and len(item) == 2 - and isinstance(item[1], (list, tuple)) + and is_sequence_but_not_str(item[1]) ): - if item[0] == slice(None): - selected_rows = self._native_frame - else: - range_ = convert_slice_to_nparray( - num_rows=len(self._native_frame), rows_slice=item[0] - ) - selected_rows = self._native_frame.take(range_) - + if len(item[1]) == 0: + # Return empty dataframe + return self._from_native_frame(self._native_frame.slice(0, 0).select([])) + selected_rows = select_rows(self._native_frame, item[0]) return self._from_native_frame(selected_rows.select(item[1])) elif isinstance(item, tuple) and len(item) == 2: if isinstance(item[1], slice): columns = self.columns if isinstance(item[1].start, str) or isinstance(item[1].stop, str): - start = ( - columns.index(item[1].start) - if item[1].start is not None - else None - ) - stop = ( - columns.index(item[1].stop) + 1 - if item[1].stop is not None - else None - ) - step = item[1].step + start, stop, step = convert_str_slice_to_int_slice(item[1], columns) return self._from_native_frame( self._native_frame.take(item[0]).select(columns[start:stop:step]) ) @@ -192,11 +184,9 @@ def __getitem__( name=col_name, backend_version=self._backend_version, ) - range_ = convert_slice_to_nparray( - num_rows=len(self._native_frame), rows_slice=item[0] - ) + selected_rows = select_rows(self._native_frame, item[0]) return ArrowSeries( - self._native_frame[col_name].take(range_), + selected_rows[col_name], name=col_name, backend_version=self._backend_version, ) @@ -205,15 +195,27 @@ def __getitem__( if item.step is not None and item.step != 1: msg = "Slicing with step is not supported on PyArrow tables" raise NotImplementedError(msg) + columns = self.columns + if isinstance(item.start, str) or isinstance(item.stop, str): + start, stop, step = convert_str_slice_to_int_slice(item, columns) + return self._from_native_frame( + self._native_frame.select(columns[start:stop:step]) + ) start = item.start or 0 - stop = item.stop or len(self._native_frame) + stop = item.stop if item.stop is not None else len(self._native_frame) return self._from_native_frame( - self._native_frame.slice(item.start, stop - start), + self._native_frame.slice(start, stop - start), ) elif isinstance(item, Sequence) or (is_numpy_array(item) and item.ndim == 1): - if isinstance(item, Sequence) and all(isinstance(x, str) for x in item): + if ( + isinstance(item, Sequence) + and all(isinstance(x, str) for x in item) + and len(item) > 0 + ): return self._from_native_frame(self._native_frame.select(item)) + if isinstance(item, Sequence) and len(item) == 0: + return self._from_native_frame(self._native_frame.slice(0, 0)) return self._from_native_frame(self._native_frame.take(item)) else: # pragma: no cover diff --git a/narwhals/_arrow/utils.py b/narwhals/_arrow/utils.py index a2a45586b..29bc3564a 100644 --- a/narwhals/_arrow/utils.py +++ b/narwhals/_arrow/utils.py @@ -8,6 +8,8 @@ from narwhals.utils import isinstance_or_issubclass if TYPE_CHECKING: + import pyarrow as pa + from narwhals._arrow.series import ArrowSeries @@ -286,3 +288,23 @@ def convert_slice_to_nparray( return np.arange(num_rows)[rows_slice] 
else: return rows_slice + + +def select_rows(table: pa.Table, rows: Any) -> pa.Table: + if isinstance(rows, slice) and rows == slice(None): + selected_rows = table + elif isinstance(rows, Sequence) and not rows: + selected_rows = table.slice(0, 0) + else: + range_ = convert_slice_to_nparray(num_rows=len(table), rows_slice=rows) + selected_rows = table.take(range_) + return selected_rows + + +def convert_str_slice_to_int_slice( + str_slice: slice, columns: list[str] +) -> tuple[int | None, int | None, int | None]: + start = columns.index(str_slice.start) if str_slice.start is not None else None + stop = columns.index(str_slice.stop) + 1 if str_slice.stop is not None else None + step = str_slice.step + return (start, stop, step) diff --git a/narwhals/_pandas_like/dataframe.py b/narwhals/_pandas_like/dataframe.py index 620670696..53481c2f2 100644 --- a/narwhals/_pandas_like/dataframe.py +++ b/narwhals/_pandas_like/dataframe.py @@ -11,6 +11,7 @@ from narwhals._expression_parsing import evaluate_into_exprs from narwhals._pandas_like.expr import PandasLikeExpr from narwhals._pandas_like.utils import broadcast_series +from narwhals._pandas_like.utils import convert_str_slice_to_int_slice from narwhals._pandas_like.utils import create_native_series from narwhals._pandas_like.utils import horizontal_concat from narwhals._pandas_like.utils import translate_dtype @@ -22,6 +23,7 @@ from narwhals.utils import Implementation from narwhals.utils import flatten from narwhals.utils import generate_unique_token +from narwhals.utils import is_sequence_but_not_str from narwhals.utils import parse_columns_to_drop if TYPE_CHECKING: @@ -119,6 +121,9 @@ def __getitem__(self, item: Sequence[str]) -> PandasLikeDataFrame: ... @overload def __getitem__(self, item: slice) -> PandasLikeDataFrame: ... + @overload + def __getitem__(self, item: tuple[slice, slice]) -> Self: ... 
+ def __getitem__( self, item: str @@ -126,7 +131,8 @@ def __getitem__( | slice | Sequence[int] | Sequence[str] - | tuple[Sequence[int], str | int], + | tuple[Sequence[int], str | int] + | tuple[slice, slice], ) -> PandasLikeSeries | PandasLikeDataFrame: if isinstance(item, str): from narwhals._pandas_like.series import PandasLikeSeries @@ -140,16 +146,19 @@ def __getitem__( elif ( isinstance(item, tuple) and len(item) == 2 - and isinstance(item[1], (tuple, list)) + and is_sequence_but_not_str(item[1]) ): + if len(item[1]) == 0: + # Return empty dataframe + return self._from_native_frame(self._native_frame.__class__()) if all(isinstance(x, int) for x in item[1]): return self._from_native_frame(self._native_frame.iloc[item]) if all(isinstance(x, str) for x in item[1]): - item = ( + indexer = ( item[0], self._native_frame.columns.get_indexer(item[1]), ) - return self._from_native_frame(self._native_frame.iloc[item]) + return self._from_native_frame(self._native_frame.iloc[indexer]) msg = ( f"Expected sequence str or int, got: {type(item[1])}" # pragma: no cover ) @@ -158,15 +167,7 @@ def __getitem__( elif isinstance(item, tuple) and len(item) == 2 and isinstance(item[1], slice): columns = self._native_frame.columns if isinstance(item[1].start, str) or isinstance(item[1].stop, str): - start = ( - columns.get_loc(item[1].start) if item[1].start is not None else None - ) - stop = ( - columns.get_loc(item[1].stop) + 1 - if item[1].stop is not None - else None - ) - step = item[1].step + start, stop, step = convert_str_slice_to_int_slice(item[1], columns) return self._from_native_frame( self._native_frame.iloc[item[0], slice(start, stop, step)] ) @@ -197,13 +198,21 @@ def __getitem__( backend_version=self._backend_version, ) - elif isinstance(item, (slice, Sequence)) or ( - is_numpy_array(item) and item.ndim == 1 - ): - if isinstance(item, Sequence) and all(isinstance(x, str) for x in item): + elif is_sequence_but_not_str(item) or (is_numpy_array(item) and item.ndim == 1): + if all(isinstance(x, str) for x in item) and len(item) > 0: return self._from_native_frame(self._native_frame.loc[:, item]) return self._from_native_frame(self._native_frame.iloc[item]) + elif isinstance(item, slice): + if isinstance(item.start, str) or isinstance(item.stop, str): + start, stop, step = convert_str_slice_to_int_slice( + item, self._native_frame.columns + ) + return self._from_native_frame( + self._native_frame.iloc[:, slice(start, stop, step)] + ) + return self._from_native_frame(self._native_frame.iloc[item]) + else: # pragma: no cover msg = f"Expected str or slice, got: {type(item)}" raise TypeError(msg) diff --git a/narwhals/_pandas_like/utils.py b/narwhals/_pandas_like/utils.py index 9e1d79ce9..bdcb71ef4 100644 --- a/narwhals/_pandas_like/utils.py +++ b/narwhals/_pandas_like/utils.py @@ -19,6 +19,7 @@ from narwhals.dtypes import DType ExprT = TypeVar("ExprT", bound=PandasLikeExpr) + import pandas as pd def validate_column_comparand(index: Any, other: Any) -> Any: @@ -497,3 +498,12 @@ def int_dtype_mapper(dtype: Any) -> str: if str(dtype).lower() != str(dtype): # pragma: no cover return "Int64" return "int64" + + +def convert_str_slice_to_int_slice( + str_slice: slice, columns: pd.Index +) -> tuple[int | None, int | None, int | None]: + start = columns.get_loc(str_slice.start) if str_slice.start is not None else None + stop = columns.get_loc(str_slice.stop) + 1 if str_slice.stop is not None else None + step = str_slice.step + return (start, stop, step) diff --git a/narwhals/_polars/dataframe.py 
b/narwhals/_polars/dataframe.py index c5164147a..9f30382da 100644 --- a/narwhals/_polars/dataframe.py +++ b/narwhals/_polars/dataframe.py @@ -4,10 +4,12 @@ from typing import Any from narwhals._polars.namespace import PolarsNamespace +from narwhals._polars.utils import convert_str_slice_to_int_slice from narwhals._polars.utils import extract_args_kwargs from narwhals._polars.utils import translate_dtype from narwhals.dependencies import get_polars from narwhals.utils import Implementation +from narwhals.utils import is_sequence_but_not_str from narwhals.utils import parse_columns_to_drop if TYPE_CHECKING: @@ -84,38 +86,52 @@ def shape(self) -> tuple[int, int]: return self._native_frame.shape # type: ignore[no-any-return] def __getitem__(self, item: Any) -> Any: - if isinstance(item, tuple) and len(item) == 2 and isinstance(item[1], slice): + if self._backend_version >= (0, 20, 30): + return self._from_native_object(self._native_frame.__getitem__(item)) + else: # pragma: no cover # TODO(marco): we can delete this branch after Polars==0.20.30 becomes the minimum # Polars version we support columns = self.columns - if isinstance(item[1].start, str) or isinstance(item[1].stop, str): - start = ( - columns.index(item[1].start) if item[1].start is not None else None - ) - stop = ( - columns.index(item[1].stop) + 1 if item[1].stop is not None else None - ) - step = item[1].step - return self._from_native_frame( - self._native_frame.select(columns[start:stop:step]).__getitem__( - item[0] + if isinstance(item, tuple) and len(item) == 2 and isinstance(item[1], slice): + if isinstance(item[1].start, str) or isinstance(item[1].stop, str): + start, stop, step = convert_str_slice_to_int_slice(item[1], columns) + return self._from_native_frame( + self._native_frame.select(columns[start:stop:step]).__getitem__( + item[0] + ) ) - ) - if isinstance(item[1].start, int) or isinstance(item[1].stop, int): + if isinstance(item[1].start, int) or isinstance(item[1].stop, int): + return self._from_native_frame( + self._native_frame.select( + columns[item[1].start : item[1].stop : item[1].step] + ).__getitem__(item[0]) + ) + msg = f"Expected slice of integers or strings, got: {type(item[1])}" # pragma: no cover + raise TypeError(msg) # pragma: no cover + pl = get_polars() + if ( + isinstance(item, tuple) + and (len(item) == 2) + and is_sequence_but_not_str(item[1]) + and (len(item[1]) == 0) + ): + result = self._native_frame.select(item[1]) + elif isinstance(item, slice) and ( + isinstance(item.start, str) or isinstance(item.stop, str) + ): + start, stop, step = convert_str_slice_to_int_slice(item, columns) return self._from_native_frame( - self._native_frame.select( - columns[item[1].start : item[1].stop : item[1].step] - ).__getitem__(item[0]) + self._native_frame.select(columns[start:stop:step]) ) - msg = f"Expected slice of integers or strings, got: {type(item[1])}" # pragma: no cover - raise TypeError(msg) # pragma: no cover - pl = get_polars() - result = self._native_frame.__getitem__(item) - if isinstance(result, pl.Series): - from narwhals._polars.series import PolarsSeries - - return PolarsSeries(result, backend_version=self._backend_version) - return self._from_native_object(result) + elif is_sequence_but_not_str(item) and (len(item) == 0): + result = self._native_frame.slice(0, 0) + else: + result = self._native_frame.__getitem__(item) + if isinstance(result, pl.Series): + from narwhals._polars.series import PolarsSeries + + return PolarsSeries(result, backend_version=self._backend_version) + return 
self._from_native_object(result) def get_column(self, name: str) -> Any: from narwhals._polars.series import PolarsSeries diff --git a/narwhals/_polars/utils.py b/narwhals/_polars/utils.py index 51f0b1898..46f399a85 100644 --- a/narwhals/_polars/utils.py +++ b/narwhals/_polars/utils.py @@ -110,3 +110,12 @@ def narwhals_to_native_dtype(dtype: dtypes.DType | type[dtypes.DType]) -> Any: if dtype == dtypes.Date: return pl.Date() return pl.Unknown() # pragma: no cover + + +def convert_str_slice_to_int_slice( + str_slice: slice, columns: list[str] +) -> tuple[int | None, int | None, int | None]: # pragma: no cover + start = columns.index(str_slice.start) if str_slice.start is not None else None + stop = columns.index(str_slice.stop) + 1 if str_slice.stop is not None else None + step = str_slice.step + return (start, stop, step) diff --git a/narwhals/dataframe.py b/narwhals/dataframe.py index b51d53baa..72ecc36bd 100644 --- a/narwhals/dataframe.py +++ b/narwhals/dataframe.py @@ -15,6 +15,7 @@ from narwhals.dependencies import is_numpy_array from narwhals.schema import Schema from narwhals.utils import flatten +from narwhals.utils import is_sequence_but_not_str from narwhals.utils import parse_version if TYPE_CHECKING: @@ -624,6 +625,9 @@ def __getitem__(self, item: Sequence[str]) -> Self: ... @overload def __getitem__(self, item: slice) -> Self: ... + @overload + def __getitem__(self, item: tuple[slice, slice]) -> Self: ... + def __getitem__( self, item: str @@ -632,7 +636,8 @@ def __getitem__( | Sequence[str] | tuple[Sequence[int], str | int] | tuple[slice, str | int] - | tuple[slice | Sequence[int], Sequence[int] | Sequence[str] | slice], + | tuple[slice | Sequence[int], Sequence[int] | Sequence[str] | slice] + | tuple[slice, slice], ) -> Series | Self: """ Extract column or slice of DataFrame. @@ -715,8 +720,12 @@ def __getitem__( if ( isinstance(item, tuple) and len(item) == 2 - and isinstance(item[1], (list, tuple, slice)) + and (is_sequence_but_not_str(item[1]) or isinstance(item[1], slice)) ): + if item[1] == slice(None) and item[0] == slice(None): + return self + if item[1] == slice(None): + return self._from_compliant_dataframe(self._compliant_frame[item[0]]) return self._from_compliant_dataframe(self._compliant_frame[item]) if isinstance(item, str) or (isinstance(item, tuple) and len(item) == 2): from narwhals.series import Series @@ -726,8 +735,10 @@ def __getitem__( level=self._level, ) - elif isinstance(item, (Sequence, slice)) or ( - is_numpy_array(item) and item.ndim == 1 + elif ( + is_sequence_but_not_str(item) + or isinstance(item, slice) + or (is_numpy_array(item) and item.ndim == 1) ): return self._from_compliant_dataframe(self._compliant_frame[item]) diff --git a/narwhals/stable/v1.py b/narwhals/stable/v1.py index fa98fd96f..1f103aae9 100644 --- a/narwhals/stable/v1.py +++ b/narwhals/stable/v1.py @@ -103,6 +103,9 @@ def __getitem__(self, item: Sequence[str]) -> Self: ... @overload def __getitem__(self, item: slice) -> Self: ... + @overload + def __getitem__(self, item: tuple[slice, slice]) -> Self: ... 
+ def __getitem__(self, item: Any) -> Any: return _stableify(super().__getitem__(item)) diff --git a/narwhals/utils.py b/narwhals/utils.py index ec3c722d4..f9d340995 100644 --- a/narwhals/utils.py +++ b/narwhals/utils.py @@ -33,6 +33,7 @@ from types import ModuleType from typing_extensions import Self + from typing_extensions import TypeGuard from narwhals.dataframe import BaseFrame from narwhals.series import Series @@ -461,3 +462,7 @@ def parse_columns_to_drop( else: to_drop = list(cols.intersection(set(to_drop))) return to_drop + + +def is_sequence_but_not_str(sequence: Any) -> TypeGuard[Sequence[Any]]: + return isinstance(sequence, Sequence) and not isinstance(sequence, str) diff --git a/tests/frame/slice_test.py b/tests/frame/slice_test.py index 0867844f9..a832daa66 100644 --- a/tests/frame/slice_test.py +++ b/tests/frame/slice_test.py @@ -116,7 +116,7 @@ def test_slice_int_rows_str_columns(constructor_eager: Any) -> None: compare_dicts(result, expected) -def test_slice_slice_columns(constructor_eager: Any) -> None: +def test_slice_slice_columns(constructor_eager: Any) -> None: # noqa: PLR0915 data = {"a": [1, 2, 3], "b": [4, 5, 6], "c": [7, 8, 9], "d": [1, 4, 2]} df = nw.from_native(constructor_eager(data), eager_only=True) result = df[[0, 1], "b":"c"] # type: ignore[misc] @@ -152,9 +152,29 @@ def test_slice_slice_columns(constructor_eager: Any) -> None: result = df[:2, [0, 2]] expected = {"a": [1, 2], "c": [7, 8]} compare_dicts(result, expected) + result = df[:2, ["a", "c"]] + expected = {"a": [1, 2], "c": [7, 8]} + compare_dicts(result, expected) + result = df[1:, [0, 2]] + expected = {"a": [2, 3], "c": [8, 9]} + compare_dicts(result, expected) + result = df[1:, ["a", "c"]] + expected = {"a": [2, 3], "c": [8, 9]} + compare_dicts(result, expected) result = df[["b", "c"]] expected = {"b": [4, 5, 6], "c": [7, 8, 9]} compare_dicts(result, expected) + result = df[:2] + expected = {"a": [1, 2], "b": [4, 5], "c": [7, 8], "d": [1, 4]} + compare_dicts(result, expected) + result = df[2:] + expected = {"a": [3], "b": [6], "c": [9], "d": [2]} + compare_dicts(result, expected) + # mypy says "Slice index must be an integer", but we do in fact support + # using string slices + result = df["a":"b"] # type: ignore[misc] + expected = {"a": [1, 2, 3], "b": [4, 5, 6]} + compare_dicts(result, expected) def test_slice_invalid(constructor_eager: Any) -> None: @@ -162,3 +182,14 @@ def test_slice_invalid(constructor_eager: Any) -> None: df = nw.from_native(constructor_eager(data), eager_only=True) with pytest.raises(TypeError, match="Hint:"): df[0, 0] + + +def test_slice_edge_cases(constructor_eager: Any) -> None: + data = {"a": [1, 2, 3], "b": [4, 5, 6], "c": [7, 8, 9], "d": [1, 4, 2]} + df = nw.from_native(constructor_eager(data), eager_only=True) + assert df[[], :].shape == (0, 4) + assert df[:, []].shape == (0, 0) + assert df[[]].shape == (0, 4) + assert df[[], ["a"]].shape == (0, 1) + assert df[:, :].shape == (3, 4) + assert df[[], []].shape == (0, 0) diff --git a/tests/series_only/slice_test.py b/tests/series_only/slice_test.py index 48cf15bc7..9ae194774 100644 --- a/tests/series_only/slice_test.py +++ b/tests/series_only/slice_test.py @@ -25,3 +25,9 @@ def test_slice(constructor_eager: Any) -> None: result = {"b": df[:2, 1]} expected = {"b": [4, 5]} compare_dicts(result, expected) + result = {"b": df[[0, 1], 1]} + expected = {"b": [4, 5]} + compare_dicts(result, expected) + result = {"b": df[[], 1]} + expected = {"b": []} + compare_dicts(result, expected) From 
8b2b30b5117f879f9d7939a41db521258b650ba6 Mon Sep 17 00:00:00 2001 From: Marco Edward Gorelli Date: Fri, 20 Sep 2024 11:14:09 +0200 Subject: [PATCH 007/145] docs: Restructure docs a bit (#1007) * restructure docs a bit * missing file * fix link --- docs/api-reference/index.md | 46 +++++++++++++--------------------- docs/extending.md | 48 ++++++++++++++++++++++++++++++++++- docs/installation.md | 50 ++++++++++++++++++++++++++++++++++++- docs/levels.md | 43 ------------------------------- docs/quick_start.md | 45 --------------------------------- docs/related.md | 13 ---------- docs/roadmap.md | 11 -------- docs/roadmap_and_related.md | 25 +++++++++++++++++++ mkdocs.yml | 13 ++++------ 9 files changed, 144 insertions(+), 150 deletions(-) delete mode 100644 docs/levels.md delete mode 100644 docs/quick_start.md delete mode 100644 docs/related.md delete mode 100644 docs/roadmap.md create mode 100644 docs/roadmap_and_related.md diff --git a/docs/api-reference/index.md b/docs/api-reference/index.md index 0c2c81fa3..b4cbc78fc 100644 --- a/docs/api-reference/index.md +++ b/docs/api-reference/index.md @@ -1,30 +1,20 @@ # API Reference -Anything documented in the API reference is intended to work consistently among -supported backends. - -For example: -```python -import narwhals as nw - -df.with_columns( - a_mean=nw.col("a").mean(), - a_std=nw.col("a").std(), -) -``` -is supported, as `DataFrame.with_columns`, `narwhals.col`, `Expr.mean`, and `Expr.std` are -all documented in the API reference. - -However, -```python -import narwhals as nw - -df.with_columns( - a_ewm_mean=nw.col("a").ewm_mean(alpha=0.7), -) -``` -is not - `Expr.ewm_mean` only appears in the Polars API reference, but not in the Narwhals -one. - -In general, you should expect any fundamental dataframe operation to be supported - if -one that you need is not, please do open a feature request! +- [Top-level functions](narwhals.md) +- [narwhals.DataFrame](dataframe.md) +- [narwhals.Expr](expr.md) +- [narwhals.Expr.cat](expr_cat.md) +- [narwhals.Expr.dt](expr_dt.md) +- [narwhals.Expr.name](expr_name.md) +- [narwhals.Expr.str](expr_str.md) +- [narwhals.GroupBy](group_by.md) +- [narwhals.LazyFrame](lazyframe.md) +- [narwhals.Schema](schema.md) +- [narwhals.Series](series.md) +- [narwhals.Series.cat](series_cat.md) +- [narwhals.Series.dt](series_dt.md) +- [narwhals.Series.str](series_str.md) +- [narwhals.dependencies](dependencies.md) +- [narwhals.dtypes](dtypes.md) +- [narwhals.selectors](selectors.md) +- [narwhals.typing](typing.md) diff --git a/docs/extending.md b/docs/extending.md index 1a750431f..13b93387c 100644 --- a/docs/extending.md +++ b/docs/extending.md @@ -1,4 +1,6 @@ -# List of supported libraries (and how to add yours!) +# Extending Narwhals, levels of support + +## List of supported libraries (and how to add yours!) Currently, Narwhals supports the following libraries as inputs: @@ -44,3 +46,47 @@ Make sure that, in addition to the public Narwhals API, you also define: Note that the "extension" mechanism is still experimental. If anything is not clear, or doesn't work, please do raise an issue or contact us on Discord (see the link on the README). + +## Levels + +Narwhals comes with two levels of support: "full" and "interchange". + +Libraries for which we have full support can benefit from the whole +[Narwhals API](https://narwhals-dev.github.io/narwhals/api-reference/). 
+ +For example: + +```python exec="1" source="above" +import narwhals as nw +from narwhals.typing import FrameT + + +@nw.narwhalify +def func(df: FrameT) -> FrameT: + return df.group_by("a").agg( + b_mean=nw.col("b").mean(), + b_std=nw.col("b").std(), + ) +``` +will work for any of pandas, Polars, cuDF, Modin, and PyArrow. + +However, sometimes you don't need to do complex operations on dataframes - all you need +is to inspect the schema a bit before making other decisions, such as which columns to +select or whether to convert to another library. For that purpose, we also provide "interchange" +level of support. If a library implements the +[Dataframe Interchange Protocol](https://data-apis.org/dataframe-protocol/latest/), then +a call such as + +```python exec="1" source="above" +from typing import Any + +import narwhals as nw +from narwhals.schema import Schema + + +def func(df: Any) -> Schema: + df = nw.from_native(df, eager_or_interchange_only=True) + return df.schema +``` +is also supported, meaning that, in addition to the libraries mentioned above, you can +also pass Ibis, Vaex, PyArrow, and any other library which implements the protocol. diff --git a/docs/installation.md b/docs/installation.md index 617606817..6dc2bfa9d 100644 --- a/docs/installation.md +++ b/docs/installation.md @@ -1,4 +1,6 @@ -# Installation +# Installation and quick start + +## Installation First, make sure you have [created and activated](https://docs.python.org/3/library/venv.html) a Python3.8+ virtual environment. @@ -14,3 +16,49 @@ Then, if you start the Python REPL and see the following: '1.8.1' ``` then installation worked correctly! + +## Quick start + +### Prerequisites + +Please start by following the [installation instructions](installation.md). + +To follow along with the examples which follow, please install the following (though note that +they are not required dependencies - Narwhals only ever uses what the user passes in): + +- [pandas](https://pandas.pydata.org/docs/getting_started/install.html) +- [Polars](https://pola-rs.github.io/polars/user-guide/installation/) + +### Simple example + +Create a Python file `t.py` with the following content: + +```python exec="1" source="above" session="quickstart" result="python" +from __future__ import annotations + +import pandas as pd +import polars as pl +import narwhals as nw +from narwhals.typing import IntoFrame + + +def my_function(df_native: IntoFrame) -> list[str]: + df = nw.from_native(df_native) + column_names = df.columns + return column_names + + +df_pandas = pd.DataFrame({"a": [1, 2, 3], "b": [4, 5, 6]}) +df_polars = pl.DataFrame({"a": [1, 2, 3], "b": [4, 5, 6]}) + +print("pandas output") +print(my_function(df_pandas)) +print("Polars output") +print(my_function(df_polars)) +``` + +If you run `python t.py` then your output should look like the above. This is the simplest possible example of a dataframe-agnostic +function - as we'll soon see, we can do much more advanced things. +Let's learn about what you just did, and what Narwhals can do for you! + +Note: these examples are only using pandas and Polars. Please see the following to find the [supported libriaries](extending.md). diff --git a/docs/levels.md b/docs/levels.md deleted file mode 100644 index 743334663..000000000 --- a/docs/levels.md +++ /dev/null @@ -1,43 +0,0 @@ -# Levels - -Narwhals comes with two levels of support: "full" and "interchange". 
- -Libraries for which we have full support can benefit from the whole -[Narwhals API](https://narwhals-dev.github.io/narwhals/api-reference/). - -For example: - -```python exec="1" source="above" -import narwhals as nw -from narwhals.typing import FrameT - - -@nw.narwhalify -def func(df: FrameT) -> FrameT: - return df.group_by("a").agg( - b_mean=nw.col("b").mean(), - b_std=nw.col("b").std(), - ) -``` -will work for any of pandas, Polars, cuDF, Modin, and PyArrow. - -However, sometimes you don't need to do complex operations on dataframes - all you need -is to inspect the schema a bit before making other decisions, such as which columns to -select or whether to convert to another library. For that purpose, we also provide "interchange" -level of support. If a library implements the -[Dataframe Interchange Protocol](https://data-apis.org/dataframe-protocol/latest/), then -a call such as - -```python exec="1" source="above" -from typing import Any - -import narwhals as nw -from narwhals.schema import Schema - - -def func(df: Any) -> Schema: - df = nw.from_native(df, eager_or_interchange_only=True) - return df.schema -``` -is also supported, meaning that, in addition to the libraries mentioned above, you can -also pass Ibis, Vaex, PyArrow, and any other library which implements the protocol. diff --git a/docs/quick_start.md b/docs/quick_start.md deleted file mode 100644 index f3ff8c05a..000000000 --- a/docs/quick_start.md +++ /dev/null @@ -1,45 +0,0 @@ -# Quick start - -## Prerequisites - -Please start by following the [installation instructions](installation.md). - -To follow along with the examples which follow, please install the following (though note that -they are not required dependencies - Narwhals only ever uses what the user passes in): - -- [pandas](https://pandas.pydata.org/docs/getting_started/install.html) -- [Polars](https://pola-rs.github.io/polars/user-guide/installation/) - -## Simple example - -Create a Python file `t.py` with the following content: - -```python exec="1" source="above" session="quickstart" result="python" -from __future__ import annotations - -import pandas as pd -import polars as pl -import narwhals as nw -from narwhals.typing import IntoFrame - - -def my_function(df_native: IntoFrame) -> list[str]: - df = nw.from_native(df_native) - column_names = df.columns - return column_names - - -df_pandas = pd.DataFrame({"a": [1, 2, 3], "b": [4, 5, 6]}) -df_polars = pl.DataFrame({"a": [1, 2, 3], "b": [4, 5, 6]}) - -print("pandas output") -print(my_function(df_pandas)) -print("Polars output") -print(my_function(df_polars)) -``` - -If you run `python t.py` then your output should look like the above. This is the simplest possible example of a dataframe-agnostic -function - as we'll soon see, we can do much more advanced things. -Let's learn about what you just did, and what Narwhals can do for you! - -Note: these examples are only using pandas and Polars. Please see the following to find the [supported libriaries](extending.md). diff --git a/docs/related.md b/docs/related.md deleted file mode 100644 index 38b0522d8..000000000 --- a/docs/related.md +++ /dev/null @@ -1,13 +0,0 @@ -# Related projects - -## Dataframe Interchange Protocol - -Standardised way of interchanging data between libraries, see -[here](https://data-apis.org/dataframe-protocol/latest/index.html). - -Narwhals builds upon it by providing one level of support to libraries which implement it - -this includes Ibis and Vaex. See [levels](levels.md) for details. 
- -## Array API - -Array counterpart to the DataFrame API, see [here](https://data-apis.org/array-api/2022.12/index.html). diff --git a/docs/roadmap.md b/docs/roadmap.md deleted file mode 100644 index 87b224bf9..000000000 --- a/docs/roadmap.md +++ /dev/null @@ -1,11 +0,0 @@ -# Roadmap - -Priorities, as of August 2024, are: - -- Works towards supporting projects which have shown interest in Narwhals. -- Implement when/then/otherwise so that Narwhals is API-complete enough to complete all the TPC-H queries. -- Make Dask support complete-enough, at least to the point that it can execute TPC-H queries. -- Improve support for cuDF, which we can't currently test in CI (unless NVIDIA helps us out :wink:) but - which we can and do test manually in Kaggle notebooks. -- Add extra docs and tutorials to make the project more accessible and easy to get started with. -- Look into extra backends, such as DuckDB and Ibis. diff --git a/docs/roadmap_and_related.md b/docs/roadmap_and_related.md new file mode 100644 index 000000000..5e4646832 --- /dev/null +++ b/docs/roadmap_and_related.md @@ -0,0 +1,25 @@ +# Roadmap and related projects + +## Roadmap + +Priorities, as of September 2024, are: + +- Works towards supporting projects which have shown interest in Narwhals. +- Add extra docs and tutorials to make the project more accessible and easy to get started with. +- Improve support for cuDF, which we can't currently test in CI (unless NVIDIA helps us out :wink:) but + which we can and do test manually in Kaggle notebooks. +- Define a lazy-only layer of support which can include DuckDB, Ibis, and PySpark. + +## Related projects + +### Dataframe Interchange Protocol + +Standardised way of interchanging data between libraries, see +[here](https://data-apis.org/dataframe-protocol/latest/index.html). + +Narwhals builds upon it by providing one level of support to libraries which implement it - +this includes Ibis and Vaex. See [extending](extending.md) for details. + +### Array API + +Array counterpart to the DataFrame API, see [here](https://data-apis.org/array-api/2022.12/index.html). 
diff --git a/mkdocs.yml b/mkdocs.yml index 8b635f78d..b0aaf106a 100644 --- a/mkdocs.yml +++ b/mkdocs.yml @@ -5,8 +5,7 @@ watch: nav: - Home: index.md - Why: why.md - - Installation: installation.md - - Quick start: quick_start.md + - Installation and quick start: installation.md - Tutorial: - basics/dataframe.md - basics/column.md @@ -15,18 +14,16 @@ nav: - other/pandas_index.md - other/user_warning.md - other/column_names.md - - levels.md - overhead.md - backcompat.md - extending.md - how_it_works.md - - Roadmap: roadmap.md - - Related projects: related.md + - Roadmap and related projects: roadmap_and_related.md - API Completeness: - api-completeness/index.md - - api-completeness/dataframe.md - - api-completeness/expr.md - - api-completeness/series.md + - Supported DataFrame methods: api-completeness/dataframe.md + - Supporteda Expr methods: api-completeness/expr.md + - Supported Series methods: api-completeness/series.md - API Reference: - api-reference/narwhals.md - api-reference/dataframe.md From d9756ea479f8564da9144f664295a803039d1664 Mon Sep 17 00:00:00 2001 From: Marco Edward Gorelli Date: Fri, 20 Sep 2024 11:25:21 +0200 Subject: [PATCH 008/145] release: Bump version to 1.8.2 (#1009) --- docs/installation.md | 2 +- narwhals/__init__.py | 2 +- pyproject.toml | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/docs/installation.md b/docs/installation.md index 6dc2bfa9d..ac7d23411 100644 --- a/docs/installation.md +++ b/docs/installation.md @@ -13,7 +13,7 @@ Then, if you start the Python REPL and see the following: ```python >>> import narwhals >>> narwhals.__version__ -'1.8.1' +'1.8.2' ``` then installation worked correctly! diff --git a/narwhals/__init__.py b/narwhals/__init__.py index 3c8b7b776..efc5cd8c1 100644 --- a/narwhals/__init__.py +++ b/narwhals/__init__.py @@ -53,7 +53,7 @@ from narwhals.utils import maybe_get_index from narwhals.utils import maybe_set_index -__version__ = "1.8.1" +__version__ = "1.8.2" __all__ = [ "dependencies", diff --git a/pyproject.toml b/pyproject.toml index efc02f9bb..0196124f6 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -4,7 +4,7 @@ build-backend = "hatchling.build" [project] name = "narwhals" -version = "1.8.1" +version = "1.8.2" authors = [ { name="Marco Gorelli", email="33491632+MarcoGorelli@users.noreply.github.com" }, ] From 315b7baf32e2d7d8e9f8dece893bf004b0de8511 Mon Sep 17 00:00:00 2001 From: Tomer Gabay Date: Fri, 20 Sep 2024 14:09:57 +0200 Subject: [PATCH 009/145] chore: removing docs/generate_members.py (#1012) --- docs/generate_members.py | 43 ---------------------------------------- 1 file changed, 43 deletions(-) delete mode 100644 docs/generate_members.py diff --git a/docs/generate_members.py b/docs/generate_members.py deleted file mode 100644 index 85492d7d2..000000000 --- a/docs/generate_members.py +++ /dev/null @@ -1,43 +0,0 @@ -# ruff: noqa -# type: ignore -import sys - -sys.path.append("..") - -import pandas as pd -import polars as pl - -pd_series = pd.Series([1], name="a").__column_consortium_standard__() -pl_series = pl.Series("a", [1]).__column_consortium_standard__() -pd_df = pd.DataFrame({"a": [1]}).__dataframe_consortium_standard__() -pl_df = pl.DataFrame({"a": [1]}).__dataframe_consortium_standard__() -pd_scalar = pd_df.col("a").mean() -pl_scalar = pl_df.col("a").mean() -pd_namespace = pd_df.__dataframe_namespace__() -pl_namespace = pl_df.__dataframe_namespace__() - -for name, object in [ - ("pandas-column.md", pd_series), - ("polars-column.md", pl_series), - ("pandas-dataframe.md", pd_df), - 
("polars-dataframe.md", pl_df), - ("pandas-scalar.md", pd_scalar), - ("polars-scalar.md", pl_scalar), - ("pandas-namespace.md", pd_scalar), - ("polars-namespace.md", pl_scalar), -]: - members = [ - i for i in object.__dir__() if not (i.startswith("_") and not i.startswith("__")) - ] - - with open(name) as fd: - content = fd.read() - - members_txt = "\n - ".join(sorted(members)) + "\n " - - start = content.index("members:") - end = content.index("show_signature") - content = content[:start] + f"members:\n - {members_txt}" + content[end:] - - with open(name, "w") as fd: - fd.write(content) From 55e71486a31c6b0713f59d6bd22dbefef9afae9c Mon Sep 17 00:00:00 2001 From: Magdalena Kowalczuk <74981211+anopsy@users.noreply.github.com> Date: Fri, 20 Sep 2024 14:31:53 +0200 Subject: [PATCH 010/145] add PyArrow usage example in docstrings `DataFrame.columns` (#1010) --- narwhals/dataframe.py | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/narwhals/dataframe.py b/narwhals/dataframe.py index 72ecc36bd..16b869ff8 100644 --- a/narwhals/dataframe.py +++ b/narwhals/dataframe.py @@ -1046,10 +1046,12 @@ def columns(self) -> list[str]: Examples: >>> import pandas as pd >>> import polars as pl + >>> import pyarrow as pa >>> import narwhals as nw >>> df = {"foo": [1, 2, 3], "bar": [6.0, 7.0, 8.0], "ham": ["a", "b", "c"]} >>> df_pd = pd.DataFrame(df) >>> df_pl = pl.DataFrame(df) + >>> df_pa = pa.table(df) We define a library agnostic function: @@ -1057,12 +1059,14 @@ def columns(self) -> list[str]: ... def func(df): ... return df.columns - We can then pass either pandas or Polars to `func`: + We can pass any supported library such as pandas, Polars, or PyArrow to `func`: >>> func(df_pd) ['foo', 'bar', 'ham'] >>> func(df_pl) ['foo', 'bar', 'ham'] + >>> func(df_pa) + ['foo', 'bar', 'ham'] """ return super().columns From 37b2516513150c7e50dcc2327679d649df8118c9 Mon Sep 17 00:00:00 2001 From: Morena <31620302+Morena1996@users.noreply.github.com> Date: Fri, 20 Sep 2024 14:52:27 +0200 Subject: [PATCH 011/145] docs: add pycapsule interface to related projects #1011 (#1017) * add section for pycapsule interface * add pycapsule description --- docs/roadmap_and_related.md | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/docs/roadmap_and_related.md b/docs/roadmap_and_related.md index 5e4646832..43580db6f 100644 --- a/docs/roadmap_and_related.md +++ b/docs/roadmap_and_related.md @@ -23,3 +23,9 @@ this includes Ibis and Vaex. See [extending](extending.md) for details. ### Array API Array counterpart to the DataFrame API, see [here](https://data-apis.org/array-api/2022.12/index.html). + +### PyCapsule Interface + +Allows C extension modules to safely share pointers to C data structures with Python code and other C modules, encapsulating the pointer with a name and optional destructor to manage resources and ensure safe access, see [here](https://arrow.apache.org/docs/format/CDataInterface/PyCapsuleInterface.html) for details. + +Narwhals supports exporting a DataFrame via the Arrow PyCapsule Interface. 
From b779cd8a4e95e6159fe6d7d74f2a3104d58af2e6 Mon Sep 17 00:00:00 2001 From: Jeroen Janssens Date: Fri, 20 Sep 2024 14:58:56 +0200 Subject: [PATCH 012/145] Only run tests located in the tests directory (#1015) --- pyproject.toml | 1 + 1 file changed, 1 insertion(+) diff --git a/pyproject.toml b/pyproject.toml index 0196124f6..f2109e117 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -99,6 +99,7 @@ force-single-line = true docstring-code-format = true [tool.pytest.ini_options] +testpaths = ["tests"] filterwarnings = [ "error", 'ignore:distutils Version classes are deprecated:DeprecationWarning', From e024da84b9e71ac5ddd26d3e91a561d7cb86b785 Mon Sep 17 00:00:00 2001 From: summerscope Date: Fri, 20 Sep 2024 15:02:57 +0200 Subject: [PATCH 013/145] Extensions page improvement (#1024) --- docs/extending.md | 12 +++++++----- 1 file changed, 7 insertions(+), 5 deletions(-) diff --git a/docs/extending.md b/docs/extending.md index 13b93387c..f6829ba3f 100644 --- a/docs/extending.md +++ b/docs/extending.md @@ -4,11 +4,13 @@ Currently, Narwhals supports the following libraries as inputs: -- pandas -- Polars -- cuDF -- Modin -- PyArrow +| Library | 🔗 Link 🔗 | +| ------------- | ------------- | +| ️Polars 🐻‍❄️ | [github.com/pola-rs/polars](https://github.com/pola-rs/polars) | +| pandas 🐼 | [github.com/pandas-dev/pandas](https://github.com/pandas-dev/pandas) | +| cuDF | [github.com/rapidsai/cudf](https://github.com/rapidsai/cudf) | +| Modin | [github.com/modin-project/modin](https://github.com/modin-project/modin) | +| PyArrow ⇶ | [arrow.apache.org/docs/python](https://arrow.apache.org/docs/python/index.html) | If you want your own library to be recognised too, you're welcome open a PR (with tests)! Alternatively, if you can't do that (for example, if you library is closed-source), see From f69ef81fd2bf7682c523d001d4f5df1b9577b7f6 Mon Sep 17 00:00:00 2001 From: juanchodpg2 <126467473+juanchodpg2@users.noreply.github.com> Date: Fri, 20 Sep 2024 15:06:43 +0200 Subject: [PATCH 014/145] docs: Add pyarrow example to Installation guide (#1023) * Add pyarrow example to Installation guide * Create separate variable for data and use dictionary format for pyarrow table input --- docs/installation.md | 12 ++++++++++-- 1 file changed, 10 insertions(+), 2 deletions(-) diff --git a/docs/installation.md b/docs/installation.md index ac7d23411..2f8012546 100644 --- a/docs/installation.md +++ b/docs/installation.md @@ -28,6 +28,7 @@ they are not required dependencies - Narwhals only ever uses what the user passe - [pandas](https://pandas.pydata.org/docs/getting_started/install.html) - [Polars](https://pola-rs.github.io/polars/user-guide/installation/) +- [PyArrow](https://arrow.apache.org/docs/python/install.html) ### Simple example @@ -38,6 +39,7 @@ from __future__ import annotations import pandas as pd import polars as pl +import pyarrow as pa import narwhals as nw from narwhals.typing import IntoFrame @@ -48,13 +50,19 @@ def my_function(df_native: IntoFrame) -> list[str]: return column_names -df_pandas = pd.DataFrame({"a": [1, 2, 3], "b": [4, 5, 6]}) -df_polars = pl.DataFrame({"a": [1, 2, 3], "b": [4, 5, 6]}) +data = {"a": [1, 2, 3], "b": [4, 5, 6]} +df_pandas = pd.DataFrame(data) +df_polars = pl.DataFrame(data) +table_pa = pa.table(data) print("pandas output") print(my_function(df_pandas)) + print("Polars output") print(my_function(df_polars)) + +print("PyArrow output") +print(my_function(table_pa)) ``` If you run `python t.py` then your output should look like the above. 
This is the simplest possible example of a dataframe-agnostic From 03780c84c0ced82079691efdac5229fe22149cad Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Mathias=20Holmstr=C3=B8m?= Date: Fri, 20 Sep 2024 15:16:50 +0200 Subject: [PATCH 015/145] docs: pyarrow example in `DataFrame.to_pandas()` method (#1018) * docstrings pyarrow support for dataframe.to_pandas() * added eager to Polars --- narwhals/dataframe.py | 13 +++++++++++-- 1 file changed, 11 insertions(+), 2 deletions(-) diff --git a/narwhals/dataframe.py b/narwhals/dataframe.py index 16b869ff8..7ac21698e 100644 --- a/narwhals/dataframe.py +++ b/narwhals/dataframe.py @@ -395,14 +395,16 @@ def to_pandas(self) -> pd.DataFrame: Convert this DataFrame to a pandas DataFrame. Examples: - Construct pandas and Polars DataFrames: + Construct pandas, Polars (eager) and PyArrow DataFrames: >>> import pandas as pd >>> import polars as pl + >>> import pyarrow as pa >>> import narwhals as nw >>> df = {"foo": [1, 2, 3], "bar": [6.0, 7.0, 8.0], "ham": ["a", "b", "c"]} >>> df_pd = pd.DataFrame(df) >>> df_pl = pl.DataFrame(df) + >>> df_pa = pa.table(df) We define a library agnostic function: @@ -410,7 +412,7 @@ def to_pandas(self) -> pd.DataFrame: ... def func(df): ... return df.to_pandas() - We can then pass either pandas or Polars to `func`: + We can then pass any supported library such as pandas, Polars (eager), or PyArrow to `func`: >>> func(df_pd) foo bar ham @@ -422,6 +424,13 @@ def to_pandas(self) -> pd.DataFrame: 0 1 6.0 a 1 2 7.0 b 2 3 8.0 c + >>> func(df_pa) + foo bar ham + 0 1 6.0 a + 1 2 7.0 b + 2 3 8.0 c + + """ return self._compliant_frame.to_pandas() From ab23d92e5765bda4f1f9601b2c8d4f7646efb97d Mon Sep 17 00:00:00 2001 From: Thomas Mendelin <36770664+thomend@users.noreply.github.com> Date: Fri, 20 Sep 2024 15:22:05 +0200 Subject: [PATCH 016/145] feat: adding to_native to Series (#1020) * added to native method for series * added docstring example * addedd test * addedd test * Update narwhals/series.py * fixed docstring in v1 * fixed docstring whitespaces --------- Co-authored-by: Francesco Bruzzesi <42817048+FBruzzesi@users.noreply.github.com> --- docs/api-reference/series.md | 1 + narwhals/series.py | 43 +++++++++++++++++++++++++++-- narwhals/stable/v1.py | 2 +- tests/series_only/to_native_test.py | 14 ++++++++++ utils/check_api_reference.py | 1 + 5 files changed, 58 insertions(+), 3 deletions(-) create mode 100644 tests/series_only/to_native_test.py diff --git a/docs/api-reference/series.md b/docs/api-reference/series.md index c016b566d..9868e7b98 100644 --- a/docs/api-reference/series.md +++ b/docs/api-reference/series.md @@ -57,6 +57,7 @@ - to_list - to_numpy - to_pandas + - to_native - unique - value_counts - zip_with diff --git a/narwhals/series.py b/narwhals/series.py index 9d21058f5..f34217c99 100644 --- a/narwhals/series.py +++ b/narwhals/series.py @@ -22,7 +22,7 @@ class Series: """ Narwhals Series, backed by a native series. - The native dataframe might be pandas.Series, polars.Series, ... + The native series might be pandas.Series, polars.Series, ... This class is not meant to be instantiated directly - instead, use `narwhals.from_native`, making sure to pass `allow_series=True` or @@ -85,6 +85,45 @@ def __arrow_c_stream__(self, requested_schema: object | None = None) -> object: ca = pa.chunked_array([self.to_arrow()]) return ca.__arrow_c_stream__(requested_schema=requested_schema) + def to_native(self) -> Any: + """ + Convert Narwhals series to native series. + + Returns: + Series of class that user started with. 
+ + Examples: + >>> import pandas as pd + >>> import polars as pl + >>> import narwhals as nw + >>> s = [1, 2, 3] + >>> s_pd = pd.Series(s) + >>> s_pl = pl.Series(s) + + We define a library agnostic function: + + >>> @nw.narwhalify + ... def func(s): + ... return s.to_native() + + We can then pass either pandas or Polars to `func`: + + >>> func(s_pd) + 0 1 + 1 2 + 2 3 + dtype: int64 + >>> func(s_pl) # doctest: +NORMALIZE_WHITESPACE + shape: (3,) + Series: '' [i64] + [ + 1 + 2 + 3 + ] + """ + return self._compliant_series._native_series + def scatter(self, indices: int | Sequence[int], values: Any) -> Self: """ Set value(s) at given position(s). @@ -224,7 +263,7 @@ def __repr__(self) -> str: # pragma: no cover + "─" * length + "┐\n" + f"|{header}|\n" - + "| Use `narwhals.to_native()` to see native output |\n" + + "| Use `series.to_native()` to see native output |\n" + "└" + "─" * length + "┘" diff --git a/narwhals/stable/v1.py b/narwhals/stable/v1.py index 1f103aae9..1cd731250 100644 --- a/narwhals/stable/v1.py +++ b/narwhals/stable/v1.py @@ -342,7 +342,7 @@ class Series(NwSeries): """ Narwhals Series, backed by a native series. - The native dataframe might be pandas.Series, polars.Series, ... + The native series might be pandas.Series, polars.Series, ... This class is not meant to be instantiated directly - instead, use `narwhals.from_native`, making sure to pass `allow_series=True` or diff --git a/tests/series_only/to_native_test.py b/tests/series_only/to_native_test.py new file mode 100644 index 000000000..269348ea3 --- /dev/null +++ b/tests/series_only/to_native_test.py @@ -0,0 +1,14 @@ +from __future__ import annotations + +from typing import Any + +import narwhals.stable.v1 as nw + +data = [4, 4, 4, 1, 6, 6, 4, 4, 1, 1] + + +def test_to_native(constructor_eager: Any) -> None: + orig_series = constructor_eager({"a": data})["a"] + nw_series = nw.from_native(constructor_eager({"a": data}), eager_only=True)["a"] + result = nw_series.to_native() + assert isinstance(result, orig_series.__class__) diff --git a/utils/check_api_reference.py b/utils/check_api_reference.py index 1bf1f086e..0f95dfedf 100644 --- a/utils/check_api_reference.py +++ b/utils/check_api_reference.py @@ -27,6 +27,7 @@ "zip_with", "item", "scatter", + "to_native", } # TODO(Unassigned): make dtypes reference page as well From 8354b8e155c856bf7300169b6332cd33448f72b2 Mon Sep 17 00:00:00 2001 From: Matteo Renoldi Date: Fri, 20 Sep 2024 15:49:37 +0200 Subject: [PATCH 017/145] docs: include PyArrow in Tutorial/DataFrame (#1016) * docs: fix blank spaces markdown code violations * docs: fix more blank spaces issues * feat: add example one * feat: add example two * fix: no .collect() in pyarrow * fix: remove .collect() * refactor: better variable names --- docs/basics/dataframe.md | 44 +++++++++++++++++++++++++++++++++++++++- 1 file changed, 43 insertions(+), 1 deletion(-) diff --git a/docs/basics/dataframe.md b/docs/basics/dataframe.md index 3cd2c884c..41be8aebf 100644 --- a/docs/basics/dataframe.md +++ b/docs/basics/dataframe.md @@ -4,7 +4,7 @@ To write a dataframe-agnostic function, the steps you'll want to follow are: 1. Initialise a Narwhals DataFrame or LazyFrame by passing your dataframe to `nw.from_native`. All the calculations stay lazy if we start with a lazy dataframe - Narwhals will never automatically trigger computation without you asking it to. - + Note: if you need eager execution, make sure to pass `eager_only=True` to `nw.from_native`. 2. 
Express your logic using the subset of the Polars API supported by Narwhals. @@ -21,6 +21,7 @@ Just like in Polars, we can pass expressions to `DataFrame.select` or `LazyFrame.select`. Make a Python file with the following content: + ```python exec="1" source="above" session="df_ex1" import narwhals as nw from narwhals.typing import FrameT @@ -34,6 +35,7 @@ def func(df: FrameT) -> FrameT: a_std=nw.col("a").std(), ) ``` + Let's try it out: === "pandas" @@ -60,7 +62,16 @@ Let's try it out: print(func(df).collect()) ``` +=== "PyArrow" + ```python exec="true" source="material-block" result="python" session="df_ex1" + import pyarrow as pa + + table = pa.table({"a": [1, 1, 2]}) + print(func(table)) + ``` + Alternatively, we could have opted for the more explicit version: + ```python import narwhals as nw from narwhals.typing import IntoFrameT @@ -75,6 +86,7 @@ def func(df_native: IntoFrameT) -> IntoFrameT: ) return nw.to_native(df) ``` + Despite being more verbose, it has the advantage of preserving the type annotation of the native object - see [typing](../api-reference/typing.md) for more details. @@ -84,6 +96,7 @@ In general, in this tutorial, we'll use the former. Just like in Polars, we can pass expressions to `GroupBy.agg`. Make a Python file with the following content: + ```python exec="1" source="above" session="df_ex2" import narwhals as nw from narwhals.typing import FrameT @@ -93,6 +106,7 @@ from narwhals.typing import FrameT def func(df: FrameT) -> FrameT: return df.group_by("a").agg(nw.col("b").mean()).sort("a") ``` + Let's try it out: === "pandas" @@ -119,12 +133,21 @@ Let's try it out: print(func(df).collect()) ``` +=== "PyArrow" + ```python exec="true" source="material-block" result="python" session="df_ex2" + import pyarrow as pa + + table = pa.table({"a": [1, 1, 2], "b": [4, 5, 6]}) + print(func(table)) + ``` + ## Example 3: horizontal sum Expressions can be free-standing functions which accept other expressions as inputs. For example, we can compute a horizontal sum using `nw.sum_horizontal`. Make a Python file with the following content: + ```python exec="1" source="above" session="df_ex3" import narwhals as nw from narwhals.typing import FrameT @@ -134,6 +157,7 @@ from narwhals.typing import FrameT def func(df: FrameT) -> FrameT: return df.with_columns(a_plus_b=nw.sum_horizontal("a", "b")) ``` + Let's try it out: === "pandas" @@ -160,6 +184,14 @@ Let's try it out: print(func(df).collect()) ``` +=== "PyArrow" + ```python exec="true" source="material-block" result="python" session="df_ex3" + import pyarrow as pa + + table = pa.table({"a": [1, 1, 2], "b": [4, 5, 6]}) + print(func(table)) + ``` + ## Example 4: multiple inputs `nw.narwhalify` can be used to decorate functions that take multiple inputs as well and @@ -169,6 +201,7 @@ For example, let's compute how many rows are left in a dataframe after filtering on a series. 
Make a Python file with the following content: + ```python exec="1" source="above" session="df_ex4" from typing import Any @@ -201,3 +234,12 @@ Let's try it out: s = pl.Series([1, 3]) print(func(df, s.to_numpy(), "a")) ``` + +=== "PyArrow" + ```python exec="true" source="material-block" result="python" session="df_ex4" + import pyarrow as pa + + table = pa.table({"a": [1, 1, 2, 2, 3], "b": [4, 5, 6, 7, 8]}) + a = pa.array([1, 3]) + print(func(table, a.to_numpy(), "a")) + ``` From d7c7a02f5f06322174544e9aa2e071c4ec20dbf7 Mon Sep 17 00:00:00 2001 From: Gauthier Piarrette Date: Fri, 20 Sep 2024 15:52:03 +0200 Subject: [PATCH 018/145] docs: Include PyArrow in Tutorial/Series (#1022) * rename column.md to series.md * update column.md to series.md in mkdocs.yml * include PyArrow in Tutorial/Series --------- Co-authored-by: Gauthier Piarrette --- docs/basics/{column.md => series.md} | 32 ++++++++++++++++++++++++++++ mkdocs.yml | 2 +- 2 files changed, 33 insertions(+), 1 deletion(-) rename docs/basics/{column.md => series.md} (84%) diff --git a/docs/basics/column.md b/docs/basics/series.md similarity index 84% rename from docs/basics/column.md rename to docs/basics/series.md index 62951cb4b..f5d8dee16 100644 --- a/docs/basics/column.md +++ b/docs/basics/series.md @@ -47,6 +47,14 @@ def my_func(df: FrameT) -> FrameT: print(my_func(df).collect()) ``` +=== "PyArrow" + ```python exec="true" source="material-block" result="python" session="ex1" + import pyarrow as pa + + table = pa.table({"a": [-1, 1, 3], "b": [3, 5, -3]}) + print(my_func(table)) + ``` + ## Example 2: multiply a column's values by a constant Let's write a dataframe-agnostic function which multiplies the values in column @@ -86,6 +94,14 @@ def my_func(df: FrameT) -> FrameT: print(my_func(df).collect()) ``` +=== "PyArrow" + ```python exec="true" source="material-block" result="python" session="ex2" + import pyarrow as pa + + table = pa.table({"a": [-1, 1, 3], "b": [3, 5, -3]}) + print(my_func(table)) + ``` + Note that column `'a'` was overwritten. If we had wanted to add a new column called `'c'` containing column `'a'`'s values multiplied by 2, we could have used `Expr.alias`: @@ -123,6 +139,14 @@ def my_func(df: FrameT) -> FrameT: print(my_func(df).collect()) ``` +=== "PyArrow" + ```python exec="true" source="material-block" result="python" session="ex2.1" + import pyarrow as pa + + table = pa.table({"a": [-1, 1, 3], "b": [3, 5, -3]}) + print(my_func(table)) + ``` + ## Example 3: finding the mean of a column as a scalar Now, we want to find the mean of column `'a'`, and we need it as a Python scalar. @@ -156,5 +180,13 @@ def my_func(df: nw.DataFrame) -> float | None: print(my_func(df)) ``` +=== "PyArrow" + ```python exec="true" source="material-block" result="python" session="ex2" + import pyarrow as pa + + table = pa.table({"a": [-1, 1, 3], "b": [3, 5, -3]}) + print(my_func(table)) + ``` + Note that, even though the output of our function is not a dataframe nor a series, we can still use `narwhalify`. 
diff --git a/mkdocs.yml b/mkdocs.yml index b0aaf106a..045a6a679 100644 --- a/mkdocs.yml +++ b/mkdocs.yml @@ -8,7 +8,7 @@ nav: - Installation and quick start: installation.md - Tutorial: - basics/dataframe.md - - basics/column.md + - basics/series.md - basics/complete_example.md - Pandas-like concepts: - other/pandas_index.md From 0341b4478cefaf9f246a123b7fba1844388f89ef Mon Sep 17 00:00:00 2001 From: juanchodpg2 <126467473+juanchodpg2@users.noreply.github.com> Date: Fri, 20 Sep 2024 22:12:41 +0200 Subject: [PATCH 019/145] Fix package dependency pyarrow in docs (#1029) --- .github/workflows/mkdocs.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/mkdocs.yml b/.github/workflows/mkdocs.yml index 37b188413..354dafd4c 100644 --- a/.github/workflows/mkdocs.yml +++ b/.github/workflows/mkdocs.yml @@ -27,6 +27,6 @@ jobs: path: .cache restore-keys: | mkdocs-material- - - run: pip install -r docs/requirements-docs.txt -e . pandas polars + - run: pip install -r docs/requirements-docs.txt -e . pandas polars pyarrow - run: mkdocs gh-deploy --force From 8badf556eb24f3b53baafcaa64cf5675f78036c2 Mon Sep 17 00:00:00 2001 From: Mike Weltevrede <46759318+mikeweltevrede@users.noreply.github.com> Date: Sat, 21 Sep 2024 12:25:44 +0200 Subject: [PATCH 020/145] docs: Update contributing guidelines (#1031) * Better explanation of tests * Add IntelliJ / Pycharm project folder to gitignore --- .gitignore | 3 +++ CONTRIBUTING.md | 3 ++- 2 files changed, 5 insertions(+), 1 deletion(-) diff --git a/.gitignore b/.gitignore index 3825a68a6..8b9adeb8f 100644 --- a/.gitignore +++ b/.gitignore @@ -32,5 +32,8 @@ tpch/data/* # VSCode .vscode/ +# IntelliJ IDEA / PyCharm +.idea/ + # MacOS .DS_Store diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md index f2ed79c62..250514743 100644 --- a/CONTRIBUTING.md +++ b/CONTRIBUTING.md @@ -91,7 +91,8 @@ If you add code that should be tested, please add tests. ### 6. Running tests To run tests, run `pytest`. To check coverage: `pytest --cov=narwhals`. -To run tests on the docset-module, use `pytest narwhals --doctest-modules`. +To run tests on the doctests, use `pytest narwhals --doctest-modules`. +To run unit tests and doctests at the same time, run `pytest tests narwhals --cov=narwhals --doctest-modules`. If you want to have less surprises when opening a PR, you can take advantage of [nox](https://nox.thea.codes/en/stable/index.html) to run the entire CI/CD test suite locally in your operating system. 
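Taken together, the test instructions above amount to a short pre-PR checklist. The following invocations are illustrative (they assume the development dependencies and `nox` are installed locally; the commands themselves are the ones quoted in CONTRIBUTING.md):

```console
$ pytest --cov=narwhals                                   # unit tests with coverage
$ pytest narwhals --doctest-modules                       # doctests in the narwhals package
$ pytest tests narwhals --cov=narwhals --doctest-modules  # unit tests and doctests together
$ nox                                                     # full CI/CD test suite locally
```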
From 201bec08bb2368d5195579cff2cf07320adead26 Mon Sep 17 00:00:00 2001 From: Jukka Pajunen Date: Sat, 21 Sep 2024 12:28:46 +0200 Subject: [PATCH 021/145] Add pyarrow to_csv docstring (#1027) * Add pyarrow to_csv docstring * Update narwhals/dataframe.py Co-authored-by: Magdalena Kowalczuk <74981211+anopsy@users.noreply.github.com> --------- Co-authored-by: Magdalena Kowalczuk <74981211+anopsy@users.noreply.github.com> --- narwhals/dataframe.py | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/narwhals/dataframe.py b/narwhals/dataframe.py index 7ac21698e..21c9c80bf 100644 --- a/narwhals/dataframe.py +++ b/narwhals/dataframe.py @@ -443,10 +443,12 @@ def write_csv(self, file: str | Path | BytesIO | None = None) -> Any: >>> import pandas as pd >>> import polars as pl + >>> import pyarrow as pa >>> import narwhals as nw >>> df = {"foo": [1, 2, 3], "bar": [6.0, 7.0, 8.0], "ham": ["a", "b", "c"]} >>> df_pd = pd.DataFrame(df) >>> df_pl = pl.DataFrame(df) + >>> df_pa = pa.table(df) We define a library agnostic function: @@ -454,12 +456,14 @@ def write_csv(self, file: str | Path | BytesIO | None = None) -> Any: ... df = nw.from_native(df) ... return df.write_csv() - We can then pass either pandas or Polars to `func`: + We can pass any supported library such as pandas, Polars or PyArrow to `func`: >>> func(df_pd) # doctest: +SKIP 'foo,bar,ham\n1,6.0,a\n2,7.0,b\n3,8.0,c\n' >>> func(df_pl) # doctest: +SKIP 'foo,bar,ham\n1,6.0,a\n2,7.0,b\n3,8.0,c\n' + >>> func(df_pa) # doctest: +SKIP + 'foo,bar,ham\n1,6.0,a\n2,7.0,b\n3,8.0,c\n' If we had passed a file name to `write_csv`, it would have been written to that file. From 1686dd8c60f4c54b0e9d357337ae5c00f496aec8 Mon Sep 17 00:00:00 2001 From: Dosenpfand Date: Sat, 21 Sep 2024 12:31:37 +0200 Subject: [PATCH 022/145] docs: Add DataFrame.pipe to API completeness table (#1021) * Add DataFrame.pipe to API completeness table * Add pipe to all --------- Co-authored-by: Markus Gasser --- utils/generate_backend_completeness.py | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/utils/generate_backend_completeness.py b/utils/generate_backend_completeness.py index b2cb9df21..1d6ee4172 100644 --- a/utils/generate_backend_completeness.py +++ b/utils/generate_backend_completeness.py @@ -38,6 +38,8 @@ class Backend(NamedTuple): EXCLUDE_CLASSES = {"BaseFrame", "Then", "When"} +DIRECTLY_IMPLEMENTED_METHODS = ["pipe"] + def get_class_methods(kls: type[Any]) -> list[str]: return [m[0] for m in inspect.getmembers(kls) if not m[0].startswith("_")] @@ -50,7 +52,9 @@ def parse_module(module_name: str, backend: str, nw_class_name: str) -> list[str module_, predicate=lambda c: inspect.isclass(c) and c.__name__.endswith(nw_class_name), ) - methods_ = get_class_methods(class_[0][1]) if class_ else [] + methods_ = ( + get_class_methods(class_[0][1]) if class_ else [] + ) + DIRECTLY_IMPLEMENTED_METHODS except ModuleNotFoundError: methods_ = [] From b8e56ae63b1e0dc577ce7535f097c4f426fc3b63 Mon Sep 17 00:00:00 2001 From: Marco Edward Gorelli Date: Sat, 21 Sep 2024 12:37:32 +0200 Subject: [PATCH 023/145] fix: polars getitem on polars==0.20.30 (#1032) --- narwhals/_polars/dataframe.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/narwhals/_polars/dataframe.py b/narwhals/_polars/dataframe.py index 9f30382da..6f61853f3 100644 --- a/narwhals/_polars/dataframe.py +++ b/narwhals/_polars/dataframe.py @@ -86,7 +86,7 @@ def shape(self) -> tuple[int, int]: return self._native_frame.shape # type: ignore[no-any-return] def __getitem__(self, 
item: Any) -> Any: - if self._backend_version >= (0, 20, 30): + if self._backend_version > (0, 20, 30): return self._from_native_object(self._native_frame.__getitem__(item)) else: # pragma: no cover # TODO(marco): we can delete this branch after Polars==0.20.30 becomes the minimum From dac1cee7db871d1c2e172b33ae05ca242c87a5d0 Mon Sep 17 00:00:00 2001 From: Liam Connors Date: Sat, 21 Sep 2024 06:49:58 -0400 Subject: [PATCH 024/145] test: add specific check for cuDF for test_to_datetime (#995) * add specific check for cuDF for test_to_datetime * add pragma: no cover to tests --- tests/expr_and_series/str/to_datetime_test.py | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/tests/expr_and_series/str/to_datetime_test.py b/tests/expr_and_series/str/to_datetime_test.py index ad666aa8a..4eb768465 100644 --- a/tests/expr_and_series/str/to_datetime_test.py +++ b/tests/expr_and_series/str/to_datetime_test.py @@ -5,6 +5,11 @@ def test_to_datetime(constructor: Constructor) -> None: + if "cudf" in str(constructor): # pragma: no cover + expected = "2020-01-01T12:34:56.000000000" + else: + expected = "2020-01-01 12:34:56" + result = ( nw.from_native(constructor(data)) .lazy() @@ -12,4 +17,4 @@ def test_to_datetime(constructor: Constructor) -> None: .collect() .item(row=0, column="b") ) - assert str(result) == "2020-01-01 12:34:56" + assert str(result) == expected From 76f4edb93185924d69efb1093ed118e03f2fe7eb Mon Sep 17 00:00:00 2001 From: Francesco Bruzzesi <42817048+FBruzzesi@users.noreply.github.com> Date: Sat, 21 Sep 2024 12:51:46 +0200 Subject: [PATCH 025/145] docs: improve check api reference (#983) --- docs/api-reference/dtypes.md | 1 + utils/check_api_reference.py | 46 ++++++++++++++++++++++++++---------- 2 files changed, 34 insertions(+), 13 deletions(-) diff --git a/docs/api-reference/dtypes.md b/docs/api-reference/dtypes.md index a607e9a54..c21b5c766 100644 --- a/docs/api-reference/dtypes.md +++ b/docs/api-reference/dtypes.md @@ -18,6 +18,7 @@ - Categorical - Enum - String + - Date - Datetime - Duration - Object diff --git a/utils/check_api_reference.py b/utils/check_api_reference.py index 0f95dfedf..a56b28c58 100644 --- a/utils/check_api_reference.py +++ b/utils/check_api_reference.py @@ -29,9 +29,11 @@ "scatter", "to_native", } +BASE_DTYPES = {"NumericType", "DType", "TemporalType"} -# TODO(Unassigned): make dtypes reference page as well files = {remove_suffix(i, ".py") for i in os.listdir("narwhals")} + +# Top level functions top_level_functions = [ i for i in nw.__dir__() if not i[0].isupper() and i[0] != "_" and i not in files ] @@ -51,7 +53,8 @@ print(extra) # noqa: T201 ret = 1 -top_level_functions = [ +# DataFrame methods +dataframe_methods = [ i for i in nw.from_native(pl.DataFrame()).__dir__() if not i[0].isupper() and i[0] != "_" @@ -63,16 +66,17 @@ for i in content.splitlines() if i.startswith(" - ") and not i.startswith(" - _") ] -if missing := set(top_level_functions).difference(documented): +if missing := set(dataframe_methods).difference(documented): print("DataFrame: not documented") # noqa: T201 print(missing) # noqa: T201 ret = 1 -if extra := set(documented).difference(top_level_functions): +if extra := set(documented).difference(dataframe_methods): print("DataFrame: outdated") # noqa: T201 print(extra) # noqa: T201 ret = 1 -top_level_functions = [ +# LazyFrame methods +lazyframe_methods = [ i for i in nw.from_native(pl.LazyFrame()).__dir__() if not i[0].isupper() and i[0] != "_" @@ -84,16 +88,17 @@ for i in content.splitlines() if i.startswith(" - ") ] 
-if missing := set(top_level_functions).difference(documented): +if missing := set(lazyframe_methods).difference(documented): print("LazyFrame: not documented") # noqa: T201 print(missing) # noqa: T201 ret = 1 -if extra := set(documented).difference(top_level_functions): +if extra := set(documented).difference(lazyframe_methods): print("LazyFrame: outdated") # noqa: T201 print(extra) # noqa: T201 ret = 1 -top_level_functions = [ +# Series methods +series_methods = [ i for i in nw.from_native(pl.Series(), series_only=True).__dir__() if not i[0].isupper() and i[0] != "_" @@ -105,16 +110,17 @@ for i in content.splitlines() if i.startswith(" - ") and not i.startswith(" - _") ] -if missing := set(top_level_functions).difference(documented).difference(NAMESPACES): +if missing := set(series_methods).difference(documented).difference(NAMESPACES): print("Series: not documented") # noqa: T201 print(missing) # noqa: T201 ret = 1 -if extra := set(documented).difference(top_level_functions): +if extra := set(documented).difference(series_methods): print("Series: outdated") # noqa: T201 print(extra) # noqa: T201 ret = 1 -top_level_functions = [ +# Expr methods +expr_methods = [ i for i in nw.Expr(lambda: 0).__dir__() if not i[0].isupper() and i[0] != "_" ] with open("docs/api-reference/expr.md") as fd: @@ -124,16 +130,30 @@ for i in content.splitlines() if i.startswith(" - ") ] -if missing := set(top_level_functions).difference(documented).difference(NAMESPACES): +if missing := set(expr_methods).difference(documented).difference(NAMESPACES): print("Expr: not documented") # noqa: T201 print(missing) # noqa: T201 ret = 1 -if extra := set(documented).difference(top_level_functions): +if extra := set(documented).difference(expr_methods): print("Expr: outdated") # noqa: T201 print(extra) # noqa: T201 ret = 1 # DTypes +dtypes = [ + i for i in nw.dtypes.__dir__() if i[0].isupper() and not i.isupper() and i[0] != "_" +] +with open("docs/api-reference/dtypes.md") as fd: + content = fd.read() +documented = [ + remove_prefix(i, " - ") + for i in content.splitlines() + if i.startswith(" - ") and not i.startswith(" - _") +] +if missing := set(dtypes).difference(documented).difference(BASE_DTYPES): + print("Dtype: not documented") # noqa: T201 + print(missing) # noqa: T201 + ret = 1 # dt From cba61e154878e8b523bd02a3ff895a85db494281 Mon Sep 17 00:00:00 2001 From: Vahideh Alizadeh <82591913+V-Alizade@users.noreply.github.com> Date: Sat, 21 Sep 2024 14:50:19 +0200 Subject: [PATCH 026/145] docs: add pyarrow example for dataframe getitem and to_dict (#1033) * add pyarrow example for dataframe getitem and to_dict * update stable/v1.py * Update narwhals/stable/v1.py --------- Co-authored-by: Marco Edward Gorelli --- narwhals/dataframe.py | 19 +++++++++++++++++-- narwhals/stable/v1.py | 6 +++++- 2 files changed, 22 insertions(+), 3 deletions(-) diff --git a/narwhals/dataframe.py b/narwhals/dataframe.py index 21c9c80bf..d100e8ef3 100644 --- a/narwhals/dataframe.py +++ b/narwhals/dataframe.py @@ -692,10 +692,12 @@ def __getitem__( Examples: >>> import pandas as pd >>> import polars as pl + >>> import pyarrow as pa >>> import narwhals as nw >>> data = {"a": [1, 2], "b": [3, 4]} >>> df_pd = pd.DataFrame(data) >>> df_pl = pl.DataFrame(data) + >>> df_pa = pa.table(data) We define a library agnostic function: @@ -703,7 +705,7 @@ def __getitem__( ... def func(df): ... 
return df["a"] - We can then pass either pandas or Polars to `func`: + We can then pass either pandas, Polars or PyArrow to `func`: >>> func(df_pd) 0 1 @@ -716,6 +718,15 @@ def __getitem__( 1 2 ] + >>> func(df_pa) # doctest:+ELLIPSIS + + [ + [ + 1, + 2 + ] + ] + """ if isinstance(item, int): item = [item] @@ -781,6 +792,7 @@ def to_dict( Examples: >>> import pandas as pd >>> import polars as pl + >>> import pyarrow as pa >>> import narwhals as nw >>> df = { ... "A": [1, 2, 3, 4, 5], @@ -791,6 +803,7 @@ def to_dict( ... } >>> df_pd = pd.DataFrame(df) >>> df_pl = pl.DataFrame(df) + >>> df_pa = pa.table(df) We define a library agnostic function: @@ -798,12 +811,14 @@ def to_dict( ... def func(df): ... return df.to_dict(as_series=False) - We can then pass either pandas or Polars to `func`: + We can then pass either pandas, Polars or PyArrow to `func`: >>> func(df_pd) {'A': [1, 2, 3, 4, 5], 'fruits': ['banana', 'banana', 'apple', 'apple', 'banana'], 'B': [5, 4, 3, 2, 1], 'animals': ['beetle', 'fly', 'beetle', 'beetle', 'beetle'], 'optional': [28.0, 300.0, nan, 2.0, -30.0]} >>> func(df_pl) {'A': [1, 2, 3, 4, 5], 'fruits': ['banana', 'banana', 'apple', 'apple', 'banana'], 'B': [5, 4, 3, 2, 1], 'animals': ['beetle', 'fly', 'beetle', 'beetle', 'beetle'], 'optional': [28, 300, None, 2, -30]} + >>> func(df_pa) + {'A': [1, 2, 3, 4, 5], 'fruits': ['banana', 'banana', 'apple', 'apple', 'banana'], 'B': [5, 4, 3, 2, 1], 'animals': ['beetle', 'fly', 'beetle', 'beetle', 'beetle'], 'optional': [28, 300, None, 2, -30]} """ from narwhals.series import Series diff --git a/narwhals/stable/v1.py b/narwhals/stable/v1.py index 1cd731250..676b30f2a 100644 --- a/narwhals/stable/v1.py +++ b/narwhals/stable/v1.py @@ -164,6 +164,7 @@ def to_dict( Examples: >>> import pandas as pd >>> import polars as pl + >>> import pyarrow as pa >>> import narwhals.stable.v1 as nw >>> df = { ... "A": [1, 2, 3, 4, 5], @@ -174,6 +175,7 @@ def to_dict( ... } >>> df_pd = pd.DataFrame(df) >>> df_pl = pl.DataFrame(df) + >>> df_pa = pa.table(df) We define a library agnostic function: @@ -181,12 +183,14 @@ def to_dict( ... def func(df): ... 
return df.to_dict(as_series=False) - We can then pass either pandas or Polars to `func`: + We can then pass either pandas, Polars or PyArrow to `func`: >>> func(df_pd) {'A': [1, 2, 3, 4, 5], 'fruits': ['banana', 'banana', 'apple', 'apple', 'banana'], 'B': [5, 4, 3, 2, 1], 'animals': ['beetle', 'fly', 'beetle', 'beetle', 'beetle'], 'optional': [28.0, 300.0, nan, 2.0, -30.0]} >>> func(df_pl) {'A': [1, 2, 3, 4, 5], 'fruits': ['banana', 'banana', 'apple', 'apple', 'banana'], 'B': [5, 4, 3, 2, 1], 'animals': ['beetle', 'fly', 'beetle', 'beetle', 'beetle'], 'optional': [28, 300, None, 2, -30]} + >>> func(df_pa) + {'A': [1, 2, 3, 4, 5], 'fruits': ['banana', 'banana', 'apple', 'apple', 'banana'], 'B': [5, 4, 3, 2, 1], 'animals': ['beetle', 'fly', 'beetle', 'beetle', 'beetle'], 'optional': [28, 300, None, 2, -30]} """ if as_series: return {key: _stableify(value) for key, value in super().to_dict().items()} From d9c0bce73c304ad6159e05784b4612b4dd31a9c3 Mon Sep 17 00:00:00 2001 From: Cheuk Ting Ho Date: Sat, 21 Sep 2024 14:32:25 +0100 Subject: [PATCH 027/145] feat: adding to_native to dataframe (#1014) --- docs/api-reference/dataframe.md | 1 + narwhals/dataframe.py | 52 +++++++++++++++++++++++++++++++-- tests/frame/to_native_test.py | 11 +++++++ 3 files changed, 62 insertions(+), 2 deletions(-) create mode 100644 tests/frame/to_native_test.py diff --git a/docs/api-reference/dataframe.md b/docs/api-reference/dataframe.md index fe0c4025d..f593293f4 100644 --- a/docs/api-reference/dataframe.md +++ b/docs/api-reference/dataframe.md @@ -37,6 +37,7 @@ - tail - to_arrow - to_dict + - to_native - to_numpy - to_pandas - unique diff --git a/narwhals/dataframe.py b/narwhals/dataframe.py index d100e8ef3..2a7a94cf8 100644 --- a/narwhals/dataframe.py +++ b/narwhals/dataframe.py @@ -30,10 +30,10 @@ from narwhals.group_by import GroupBy from narwhals.group_by import LazyGroupBy from narwhals.series import Series - from narwhals.typing import IntoDataFrame from narwhals.typing import IntoExpr + from narwhals.typing import IntoFrame -FrameT = TypeVar("FrameT", bound="IntoDataFrame") +FrameT = TypeVar("FrameT", bound="IntoFrame") class BaseFrame(Generic[FrameT]): @@ -390,6 +390,54 @@ def lazy(self) -> LazyFrame[Any]: """ return super().lazy() + def to_native(self) -> FrameT: + """ + Convert Narwhals DataFrame to native one. + + Returns: + Object of class that user started with. + + Examples: + >>> import pandas as pd + >>> import polars as pl + >>> import pyarrow as pa + >>> import narwhals as nw + >>> data = {"foo": [1, 2, 3], "bar": [6.0, 7.0, 8.0], "ham": ["a", "b", "c"]} + >>> df_pd = pd.DataFrame(data) + >>> df_pl = pl.DataFrame(data) + >>> df_pa = pa.table(data) + + Calling `to_native` on a Narwhals DataFrame returns the native object: + + >>> nw.from_native(df_pd).to_native() + foo bar ham + 0 1 6.0 a + 1 2 7.0 b + 2 3 8.0 c + >>> nw.from_native(df_pl).to_native() + shape: (3, 3) + ┌─────┬─────┬─────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ str │ + ╞═════╪═════╪═════╡ + │ 1 ┆ 6.0 ┆ a │ + │ 2 ┆ 7.0 ┆ b │ + │ 3 ┆ 8.0 ┆ c │ + └─────┴─────┴─────┘ + >>> nw.from_native(df_pa).to_native() + pyarrow.Table + foo: int64 + bar: double + ham: string + ---- + foo: [[1,2,3]] + bar: [[6,7,8]] + ham: [["a","b","c"]] + """ + + return self._compliant_frame._native_frame # type: ignore[no-any-return] + def to_pandas(self) -> pd.DataFrame: """ Convert this DataFrame to a pandas DataFrame. 
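The new `DataFrame.to_native` method above makes the wrap/unwrap round trip explicit. A minimal usage sketch (the sample data and variable names here are illustrative, not taken from the patch):

```python
import polars as pl
import narwhals as nw

native = pl.DataFrame({"a": [1, 2, 3]})
df = nw.from_native(native)      # wrap the native object in a Narwhals DataFrame
round_tripped = df.to_native()   # unwrap it again
assert isinstance(round_tripped, pl.DataFrame)
```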
diff --git a/tests/frame/to_native_test.py b/tests/frame/to_native_test.py new file mode 100644 index 000000000..236403e23 --- /dev/null +++ b/tests/frame/to_native_test.py @@ -0,0 +1,11 @@ +from typing import Any + +import narwhals.stable.v1 as nw + + +def test_to_native(constructor_eager: Any) -> None: + data = {"a": [1, 3, 2], "b": [4, 4, 6], "z": [7.1, 8, 9]} + df_raw = constructor_eager(data) + df = nw.from_native(df_raw, eager_only=True) + + assert isinstance(df.to_native(), df_raw.__class__) From 47dae475d227e3d031560c0c5aee02245b40cda7 Mon Sep 17 00:00:00 2001 From: Francesco Bruzzesi <42817048+FBruzzesi@users.noreply.github.com> Date: Sat, 21 Sep 2024 15:35:10 +0200 Subject: [PATCH 028/145] perf: low-hanging overhead (#1034) --- narwhals/_arrow/selectors.py | 6 ++-- narwhals/_arrow/utils.py | 3 +- narwhals/_dask/selectors.py | 6 ++-- narwhals/_dask/utils.py | 13 +++----- narwhals/_expression_parsing.py | 2 +- narwhals/_pandas_like/group_by.py | 3 +- narwhals/_pandas_like/selectors.py | 6 ++-- narwhals/_pandas_like/utils.py | 53 +++++++++++++----------------- narwhals/translate.py | 2 +- narwhals/utils.py | 4 +-- 10 files changed, 44 insertions(+), 54 deletions(-) diff --git a/narwhals/_arrow/selectors.py b/narwhals/_arrow/selectors.py index f757ffede..569724c45 100644 --- a/narwhals/_arrow/selectors.py +++ b/narwhals/_arrow/selectors.py @@ -99,7 +99,7 @@ def __sub__(self: Self, other: Self | Any) -> ArrowSelector | Any: def call(df: ArrowDataFrame) -> list[ArrowSeries]: lhs = self._call(df) rhs = other._call(df) - return [x for x in lhs if x.name not in [x.name for x in rhs]] + return [x for x in lhs if x.name not in {x.name for x in rhs}] return ArrowSelector( call, @@ -118,7 +118,7 @@ def __or__(self: Self, other: Self | Any) -> ArrowSelector | Any: def call(df: ArrowDataFrame) -> list[ArrowSeries]: lhs = self._call(df) rhs = other._call(df) - return [x for x in lhs if x.name not in [x.name for x in rhs]] + rhs + return [x for x in lhs if x.name not in {x.name for x in rhs}] + rhs return ArrowSelector( call, @@ -137,7 +137,7 @@ def __and__(self: Self, other: Self | Any) -> ArrowSelector | Any: def call(df: ArrowDataFrame) -> list[ArrowSeries]: lhs = self._call(df) rhs = other._call(df) - return [x for x in lhs if x.name in [x.name for x in rhs]] + return [x for x in lhs if x.name in {x.name for x in rhs}] return ArrowSelector( call, diff --git a/narwhals/_arrow/utils.py b/narwhals/_arrow/utils.py index 29bc3564a..ddf7a8639 100644 --- a/narwhals/_arrow/utils.py +++ b/narwhals/_arrow/utils.py @@ -265,10 +265,11 @@ def broadcast_series(series: list[ArrowSeries]) -> list[Any]: import pyarrow as pa # ignore-banned-import + is_max_length_gt_1 = max_length > 1 reshaped = [] for s, length in zip(series, lengths): s_native = s._native_series - if max_length > 1 and length == 1: + if is_max_length_gt_1 and length == 1: value = s_native[0] if s._backend_version < (13,) and hasattr(value, "as_py"): # pragma: no cover value = value.as_py() diff --git a/narwhals/_dask/selectors.py b/narwhals/_dask/selectors.py index d3525f71f..54131a8a5 100644 --- a/narwhals/_dask/selectors.py +++ b/narwhals/_dask/selectors.py @@ -102,7 +102,7 @@ def __sub__(self: Self, other: DaskSelector | Any) -> DaskSelector | Any: def call(df: DaskLazyFrame) -> list[Any]: lhs = self._call(df) rhs = other._call(df) - return [x for x in lhs if x.name not in [x.name for x in rhs]] + return [x for x in lhs if x.name not in {x.name for x in rhs}] return DaskSelector( call, @@ -122,7 +122,7 @@ def __or__(self: Self, other: 
DaskSelector | Any) -> DaskSelector | Any: def call(df: DaskLazyFrame) -> list[dask_expr.Series]: lhs = self._call(df) rhs = other._call(df) - return [x for x in lhs if x.name not in [x.name for x in rhs]] + rhs + return [x for x in lhs if x.name not in {x.name for x in rhs}] + rhs return DaskSelector( call, @@ -142,7 +142,7 @@ def __and__(self: Self, other: DaskSelector | Any) -> DaskSelector | Any: def call(df: DaskLazyFrame) -> list[Any]: lhs = self._call(df) rhs = other._call(df) - return [x for x in lhs if x.name in [x.name for x in rhs]] + return [x for x in lhs if x.name in {x.name for x in rhs}] return DaskSelector( call, diff --git a/narwhals/_dask/utils.py b/narwhals/_dask/utils.py index 1f5cda4ba..274044979 100644 --- a/narwhals/_dask/utils.py +++ b/narwhals/_dask/utils.py @@ -45,21 +45,18 @@ def parse_exprs_and_named_exprs( else: # pragma: no cover msg = f"Expected expression or column name, got: {expr}" raise TypeError(msg) + return_scalar = getattr(expr, "_returns_scalar", False) for _result in _results: - if getattr(expr, "_returns_scalar", False): - results[_result.name] = _result[0] - else: - results[_result.name] = _result + results[_result.name] = _result[0] if return_scalar else _result + for name, value in named_exprs.items(): _results = value._call(df) if len(_results) != 1: # pragma: no cover msg = "Named expressions must return a single column" raise AssertionError(msg) + return_scalar = getattr(value, "_returns_scalar", False) for _result in _results: - if getattr(value, "_returns_scalar", False): - results[name] = _result[0] - else: - results[name] = _result + results[name] = _result[0] if return_scalar else _result return results diff --git a/narwhals/_expression_parsing.py b/narwhals/_expression_parsing.py index a74ca3c63..10d686ab1 100644 --- a/narwhals/_expression_parsing.py +++ b/narwhals/_expression_parsing.py @@ -95,7 +95,7 @@ def evaluate_into_exprs( """Evaluate each expr into Series.""" series: ListOfCompliantSeries = [ # type: ignore[assignment] item - for sublist in [evaluate_into_expr(df, into_expr) for into_expr in flatten(exprs)] + for sublist in (evaluate_into_expr(df, into_expr) for into_expr in flatten(exprs)) for item in sublist ] for name, expr in named_exprs.items(): diff --git a/narwhals/_pandas_like/group_by.py b/narwhals/_pandas_like/group_by.py index 892291d57..a0ee03f7d 100644 --- a/narwhals/_pandas_like/group_by.py +++ b/narwhals/_pandas_like/group_by.py @@ -161,8 +161,9 @@ def agg_pandas( # noqa: PLR0915 function_name = POLARS_TO_PANDAS_AGGREGATIONS.get( function_name, function_name ) + is_n_unique = function_name == "nunique" for root_name, output_name in zip(expr._root_names, expr._output_names): - if function_name == "nunique": + if is_n_unique: nunique_aggs[output_name] = root_name else: simple_aggregations[output_name] = (root_name, function_name) diff --git a/narwhals/_pandas_like/selectors.py b/narwhals/_pandas_like/selectors.py index 8dd9ceae5..1214e12fc 100644 --- a/narwhals/_pandas_like/selectors.py +++ b/narwhals/_pandas_like/selectors.py @@ -102,7 +102,7 @@ def __sub__(self, other: PandasSelector | Any) -> PandasSelector | Any: def call(df: PandasLikeDataFrame) -> list[PandasLikeSeries]: lhs = self._call(df) rhs = other._call(df) - return [x for x in lhs if x.name not in [x.name for x in rhs]] + return [x for x in lhs if x.name not in {x.name for x in rhs}] return PandasSelector( call, @@ -122,7 +122,7 @@ def __or__(self, other: PandasSelector | Any) -> PandasSelector | Any: def call(df: PandasLikeDataFrame) -> 
list[PandasLikeSeries]: lhs = self._call(df) rhs = other._call(df) - return [x for x in lhs if x.name not in [x.name for x in rhs]] + rhs + return [x for x in lhs if x.name not in {x.name for x in rhs}] + rhs return PandasSelector( call, @@ -142,7 +142,7 @@ def __and__(self, other: PandasSelector | Any) -> PandasSelector | Any: def call(df: PandasLikeDataFrame) -> list[PandasLikeSeries]: lhs = self._call(df) rhs = other._call(df) - return [x for x in lhs if x.name in [x.name for x in rhs]] + return [x for x in lhs if x.name in {x.name for x in rhs}] return PandasSelector( call, diff --git a/narwhals/_pandas_like/utils.py b/narwhals/_pandas_like/utils.py index bdcb71ef4..a65e361a3 100644 --- a/narwhals/_pandas_like/utils.py +++ b/narwhals/_pandas_like/utils.py @@ -221,63 +221,54 @@ def set_axis( def translate_dtype(column: Any) -> DType: from narwhals import dtypes - dtype = column.dtype - if str(dtype) in ("int64", "Int64", "Int64[pyarrow]", "int64[pyarrow]"): + dtype = str(column.dtype) + if dtype in {"int64", "Int64", "Int64[pyarrow]", "int64[pyarrow]"}: return dtypes.Int64() - if str(dtype) in ("int32", "Int32", "Int32[pyarrow]", "int32[pyarrow]"): + if dtype in {"int32", "Int32", "Int32[pyarrow]", "int32[pyarrow]"}: return dtypes.Int32() - if str(dtype) in ("int16", "Int16", "Int16[pyarrow]", "int16[pyarrow]"): + if dtype in {"int16", "Int16", "Int16[pyarrow]", "int16[pyarrow]"}: return dtypes.Int16() - if str(dtype) in ("int8", "Int8", "Int8[pyarrow]", "int8[pyarrow]"): + if dtype in {"int8", "Int8", "Int8[pyarrow]", "int8[pyarrow]"}: return dtypes.Int8() - if str(dtype) in ("uint64", "UInt64", "UInt64[pyarrow]", "uint64[pyarrow]"): + if dtype in {"uint64", "UInt64", "UInt64[pyarrow]", "uint64[pyarrow]"}: return dtypes.UInt64() - if str(dtype) in ("uint32", "UInt32", "UInt32[pyarrow]", "uint32[pyarrow]"): + if dtype in {"uint32", "UInt32", "UInt32[pyarrow]", "uint32[pyarrow]"}: return dtypes.UInt32() - if str(dtype) in ("uint16", "UInt16", "UInt16[pyarrow]", "uint16[pyarrow]"): + if dtype in {"uint16", "UInt16", "UInt16[pyarrow]", "uint16[pyarrow]"}: return dtypes.UInt16() - if str(dtype) in ("uint8", "UInt8", "UInt8[pyarrow]", "uint8[pyarrow]"): + if dtype in {"uint8", "UInt8", "UInt8[pyarrow]", "uint8[pyarrow]"}: return dtypes.UInt8() - if str(dtype) in ( + if dtype in { "float64", "Float64", "Float64[pyarrow]", "float64[pyarrow]", "double[pyarrow]", - ): + }: return dtypes.Float64() - if str(dtype) in ( + if dtype in { "float32", "Float32", "Float32[pyarrow]", "float32[pyarrow]", "float[pyarrow]", - ): + }: return dtypes.Float32() - if str(dtype) in ( - "string", - "string[python]", - "string[pyarrow]", - "large_string[pyarrow]", - ): + if dtype in {"string", "string[python]", "string[pyarrow]", "large_string[pyarrow]"}: return dtypes.String() - if str(dtype) in ("bool", "boolean", "boolean[pyarrow]", "bool[pyarrow]"): + if dtype in {"bool", "boolean", "boolean[pyarrow]", "bool[pyarrow]"}: return dtypes.Boolean() - if str(dtype) in ("category",) or str(dtype).startswith("dictionary<"): + if dtype == "category" or dtype.startswith("dictionary<"): return dtypes.Categorical() - if str(dtype).startswith("datetime64"): + if dtype.startswith(("datetime64", "timestamp[")): # TODO(Unassigned): different time units and time zones return dtypes.Datetime() - if str(dtype).startswith("timedelta64") or str(dtype).startswith("duration"): + if dtype.startswith(("timedelta64", "duration")): # TODO(Unassigned): different time units return dtypes.Duration() - if str(dtype).startswith("timestamp["): 
- # pyarrow-backed datetime - # TODO(Unassigned): different time units and time zones - return dtypes.Datetime() - if str(dtype) == "date32[day][pyarrow]": + if dtype == "date32[day][pyarrow]": return dtypes.Date() - if str(dtype) == "object": + if dtype == "object": if ( # pragma: no cover TODO(unassigned): why does this show as uncovered? idx := getattr(column, "first_valid_index", lambda: None)() ) is not None and isinstance(column.loc[idx], str): @@ -455,10 +446,10 @@ def broadcast_series(series: list[PandasLikeSeries]) -> list[Any]: idx = series[lengths.index(max_length)]._native_series.index reindexed = [] - + max_length_gt_1 = max_length > 1 for s, length in zip(series, lengths): s_native = s._native_series - if max_length > 1 and length == 1: + if max_length_gt_1 and length == 1: reindexed.append( native_namespace.Series( [s_native.iloc[0]] * max_length, diff --git a/narwhals/translate.py b/narwhals/translate.py index 7b7d09de5..0e7706fb7 100644 --- a/narwhals/translate.py +++ b/narwhals/translate.py @@ -703,7 +703,7 @@ def wrapper(*args: Any, **kwargs: Any) -> Any: backends = { b() - for v in [*args, *kwargs.values()] + for v in (*args, *kwargs.values()) if (b := getattr(v, "__native_namespace__", None)) } diff --git a/narwhals/utils.py b/narwhals/utils.py index f9d340995..15919a922 100644 --- a/narwhals/utils.py +++ b/narwhals/utils.py @@ -1,9 +1,9 @@ from __future__ import annotations import re -import secrets from enum import Enum from enum import auto +from secrets import token_hex from typing import TYPE_CHECKING from typing import Any from typing import Iterable @@ -433,7 +433,7 @@ def generate_unique_token(n_bytes: int, columns: list[str]) -> str: # pragma: n """ counter = 0 while True: - token = secrets.token_hex(n_bytes) + token = token_hex(n_bytes) if token not in columns: return token From 967f6af2b2f3b1603dc0327f11ff152547d89422 Mon Sep 17 00:00:00 2001 From: Francesco Bruzzesi <42817048+FBruzzesi@users.noreply.github.com> Date: Sat, 21 Sep 2024 15:48:20 +0200 Subject: [PATCH 029/145] fix: docs generate backend table (#1035) --- utils/generate_backend_completeness.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/utils/generate_backend_completeness.py b/utils/generate_backend_completeness.py index 1d6ee4172..7f2e23ca2 100644 --- a/utils/generate_backend_completeness.py +++ b/utils/generate_backend_completeness.py @@ -53,8 +53,10 @@ def parse_module(module_name: str, backend: str, nw_class_name: str) -> list[str predicate=lambda c: inspect.isclass(c) and c.__name__.endswith(nw_class_name), ) methods_ = ( - get_class_methods(class_[0][1]) if class_ else [] - ) + DIRECTLY_IMPLEMENTED_METHODS + get_class_methods(class_[0][1]) + DIRECTLY_IMPLEMENTED_METHODS + if class_ + else [] + ) except ModuleNotFoundError: methods_ = [] From 1cef1f03902dd36a763c3795759fb8ee9004c005 Mon Sep 17 00:00:00 2001 From: Heitor Cunha Silva <31899901+HeitCunha@users.noreply.github.com> Date: Sat, 21 Sep 2024 15:54:59 +0200 Subject: [PATCH 030/145] Implementing to_native in lazyframe (#1025) --- docs/api-reference/lazyframe.md | 1 + narwhals/dataframe.py | 58 ++++++++++++++++++++++++++++----- narwhals/stable/v1.py | 10 +++--- tests/frame/to_native_test.py | 6 ++-- 4 files changed, 58 insertions(+), 17 deletions(-) diff --git a/docs/api-reference/lazyframe.md b/docs/api-reference/lazyframe.md index 5d472bab6..89a8282ac 100644 --- a/docs/api-reference/lazyframe.md +++ b/docs/api-reference/lazyframe.md @@ -23,6 +23,7 @@ - select - sort - tail + - to_native - unique - 
with_columns - with_row_index diff --git a/narwhals/dataframe.py b/narwhals/dataframe.py index 2a7a94cf8..83ebdcdc5 100644 --- a/narwhals/dataframe.py +++ b/narwhals/dataframe.py @@ -14,6 +14,7 @@ from narwhals.dependencies import get_polars from narwhals.dependencies import is_numpy_array from narwhals.schema import Schema +from narwhals.translate import to_native from narwhals.utils import flatten from narwhals.utils import is_sequence_but_not_str from narwhals.utils import parse_version @@ -319,14 +320,14 @@ def __array__(self, dtype: Any = None, copy: bool | None = None) -> np.ndarray: return self._compliant_frame.__array__(dtype, copy=copy) def __repr__(self) -> str: # pragma: no cover - header = " Narwhals DataFrame " + header = " Narwhals DataFrame " length = len(header) return ( "┌" + "─" * length + "┐\n" + f"|{header}|\n" - + "| Use `narwhals.to_native` to see native output |\n" + + "| Use `.to_native` to see native output |\n" + "└" + "─" * length + "┘" @@ -2640,14 +2641,14 @@ def __init__( raise AssertionError(msg) def __repr__(self) -> str: # pragma: no cover - header = " Narwhals LazyFrame " + header = " Narwhals LazyFrame " length = len(header) return ( "┌" + "─" * length + "┐\n" + f"|{header}|\n" - + "| Use `narwhals.to_native` to see native output |\n" + + "| Use `.to_native` to see native output |\n" + "└" + "─" * length + "┘" @@ -2676,12 +2677,12 @@ def collect(self) -> DataFrame[Any]: ... ) >>> lf = nw.from_native(lf_pl) >>> lf - ┌───────────────────────────────────────────────┐ - | Narwhals LazyFrame | - | Use `narwhals.to_native` to see native output | - └───────────────────────────────────────────────┘ + ┌───────────────────────────────────────┐ + | Narwhals LazyFrame | + | Use `.to_native` to see native output | + └───────────────────────────────────────┘ >>> df = lf.group_by("a").agg(nw.all().sum()).collect() - >>> nw.to_native(df).sort("a") + >>> df.to_native().sort("a") shape: (3, 3) ┌─────┬─────┬─────┐ │ a ┆ b ┆ c │ @@ -2698,6 +2699,45 @@ def collect(self) -> DataFrame[Any]: level=self._level, ) + def to_native(self) -> FrameT: + """ + Convert Narwhals LazyFrame to native one. + + Returns: + Object of class that user started with. + + Examples: + >>> import pandas as pd + >>> import polars as pl + >>> import pyarrow as pa + >>> import narwhals as nw + >>> data = {"foo": [1, 2, 3], "bar": [6.0, 7.0, 8.0], "ham": ["a", "b", "c"]} + >>> df_pd = pd.DataFrame(data) + >>> df_pl = pl.LazyFrame(data) + >>> df_pa = pa.table(data) + + Calling `to_native` on a Narwhals DataFrame returns the native object: + + >>> nw.from_native(df_pd).lazy().to_native() + foo bar ham + 0 1 6.0 a + 1 2 7.0 b + 2 3 8.0 c + >>> nw.from_native(df_pl).to_native().collect() + shape: (3, 3) + ┌─────┬─────┬─────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ str │ + ╞═════╪═════╪═════╡ + │ 1 ┆ 6.0 ┆ a │ + │ 2 ┆ 7.0 ┆ b │ + │ 3 ┆ 8.0 ┆ c │ + └─────┴─────┴─────┘ + """ + + return to_native(narwhals_object=self, strict=True) + # inherited def pipe(self, function: Callable[[Any], Self], *args: Any, **kwargs: Any) -> Self: """ diff --git a/narwhals/stable/v1.py b/narwhals/stable/v1.py index 676b30f2a..0f34c88fb 100644 --- a/narwhals/stable/v1.py +++ b/narwhals/stable/v1.py @@ -322,12 +322,12 @@ def collect(self) -> DataFrame[Any]: ... 
) >>> lf = nw.from_native(lf_pl) >>> lf - ┌───────────────────────────────────────────────┐ - | Narwhals LazyFrame | - | Use `narwhals.to_native` to see native output | - └───────────────────────────────────────────────┘ + ┌───────────────────────────────────────┐ + | Narwhals LazyFrame | + | Use `.to_native` to see native output | + └───────────────────────────────────────┘ >>> df = lf.group_by("a").agg(nw.all().sum()).collect() - >>> nw.to_native(df).sort("a") + >>> df.to_native().sort("a") shape: (3, 3) ┌─────┬─────┬─────┐ │ a ┆ b ┆ c │ diff --git a/tests/frame/to_native_test.py b/tests/frame/to_native_test.py index 236403e23..d8f4132bf 100644 --- a/tests/frame/to_native_test.py +++ b/tests/frame/to_native_test.py @@ -3,9 +3,9 @@ import narwhals.stable.v1 as nw -def test_to_native(constructor_eager: Any) -> None: +def test_to_native(constructor: Any) -> None: data = {"a": [1, 3, 2], "b": [4, 4, 6], "z": [7.1, 8, 9]} - df_raw = constructor_eager(data) - df = nw.from_native(df_raw, eager_only=True) + df_raw = constructor(data) + df = nw.from_native(df_raw) assert isinstance(df.to_native(), df_raw.__class__) From 58d8535ec9737b5ae0623a1eb5aa22b4eaa2ece0 Mon Sep 17 00:00:00 2001 From: Thijs Nieuwdorp Date: Sat, 21 Sep 2024 16:05:15 +0200 Subject: [PATCH 031/145] test: use pytest-randomly, recommend pytest-xdist in docs (#1019) * add pytest-xdist dep * Set distributed as standard pytest option * add docs note, use pytest-randomly --------- Co-authored-by: Francesco Bruzzesi <42817048+FBruzzesi@users.noreply.github.com> Co-authored-by: Marco Gorelli <33491632+MarcoGorelli@users.noreply.github.com> --- CONTRIBUTING.md | 7 ++++--- requirements-dev.txt | 1 + 2 files changed, 5 insertions(+), 3 deletions(-) diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md index 250514743..a30273970 100644 --- a/CONTRIBUTING.md +++ b/CONTRIBUTING.md @@ -90,9 +90,10 @@ If you add code that should be tested, please add tests. ### 6. Running tests -To run tests, run `pytest`. To check coverage: `pytest --cov=narwhals`. -To run tests on the doctests, use `pytest narwhals --doctest-modules`. -To run unit tests and doctests at the same time, run `pytest tests narwhals --cov=narwhals --doctest-modules`. +- To run tests, run `pytest`. To check coverage: `pytest --cov=narwhals` +- To run tests on the doctests, use `pytest narwhals --doctest-modules` +- To run unit tests and doctests at the same time, run `pytest tests narwhals --cov=narwhals --doctest-modules` +- To run tests multiprocessed, you may also want to use [pytest-xdist](https://github.com/pytest-dev/pytest-xdist) (optional) If you want to have less surprises when opening a PR, you can take advantage of [nox](https://nox.thea.codes/en/stable/index.html) to run the entire CI/CD test suite locally in your operating system. 
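The new bullet above only names the plugin; in practice a parallel or order-controlled run looks something like the following. These flags come from pytest-xdist and pytest-randomly themselves (they are not part of this patch) and assume both plugins are installed:

```console
$ pytest -n auto               # pytest-xdist: spread tests across available CPU cores
$ pytest --randomly-seed=last  # pytest-randomly: replay the previous shuffled test order
$ pytest -p no:randomly        # disable test-order shuffling for a deterministic run
```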
diff --git a/requirements-dev.txt b/requirements-dev.txt index 44d57530d..2158f0821 100644 --- a/requirements-dev.txt +++ b/requirements-dev.txt @@ -7,6 +7,7 @@ pre-commit pyarrow pytest pytest-cov +pytest-randomly pytest-env hypothesis scikit-learn From 6b6fefd524ee0eafa4b23247f1a5e4a90d78d138 Mon Sep 17 00:00:00 2001 From: Mike Weltevrede <46759318+mikeweltevrede@users.noreply.github.com> Date: Sat, 21 Sep 2024 16:15:49 +0200 Subject: [PATCH 032/145] fix: tuple supported for getitem for Pandas dataframes (#1026) * Set up tests * Initial commit * Fix typo * Expand docs with running unit and doc tests at same time * Parametrize test * Simplify test * Transform tuple * Improve test structure * gitignore IntelliJ / Pycharm folder * PandasLikeDataframe needs to deal with Sequence for columns. * isinstance check to sequence * Added string test to the tuple/list check for get_item. * Fix type annotation in test. * Improve type coverage for get_item tests. * Fix arrow dataframe get_item with tuple as well. * move tuple slice tests to slice_test.py * fix precommit * undo .idea from gitignore * rename slice_test to getitem_test * Revert changes in CONTRIBUTING.md * Update test names * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * fix for old polars versions --------- Co-authored-by: windiana42 Co-authored-by: Rik van der Vlist Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com> Co-authored-by: Marco Gorelli <33491632+MarcoGorelli@users.noreply.github.com> --- narwhals/_arrow/dataframe.py | 3 + narwhals/_pandas_like/dataframe.py | 12 ++++ narwhals/_polars/dataframe.py | 3 + .../frame/{slice_test.py => getitem_test.py} | 57 +++++++++++++++++++ 4 files changed, 75 insertions(+) rename tests/frame/{slice_test.py => getitem_test.py} (82%) diff --git a/narwhals/_arrow/dataframe.py b/narwhals/_arrow/dataframe.py index 5b74ddb73..89919c763 100644 --- a/narwhals/_arrow/dataframe.py +++ b/narwhals/_arrow/dataframe.py @@ -136,6 +136,9 @@ def __getitem__( | tuple[slice, str | int] | tuple[slice, slice], ) -> ArrowSeries | ArrowDataFrame: + if isinstance(item, tuple): + item = tuple(list(i) if is_sequence_but_not_str(i) else i for i in item) + if isinstance(item, str): from narwhals._arrow.series import ArrowSeries diff --git a/narwhals/_pandas_like/dataframe.py b/narwhals/_pandas_like/dataframe.py index 53481c2f2..446c68a13 100644 --- a/narwhals/_pandas_like/dataframe.py +++ b/narwhals/_pandas_like/dataframe.py @@ -124,6 +124,14 @@ def __getitem__(self, item: slice) -> PandasLikeDataFrame: ... @overload def __getitem__(self, item: tuple[slice, slice]) -> Self: ... + @overload + def __getitem__( + self, item: tuple[Sequence[int], Sequence[int] | slice] + ) -> PandasLikeDataFrame: ... + + @overload + def __getitem__(self, item: tuple[slice, Sequence[int]]) -> PandasLikeDataFrame: ... 
+ def __getitem__( self, item: str @@ -132,8 +140,12 @@ def __getitem__( | Sequence[int] | Sequence[str] | tuple[Sequence[int], str | int] + | tuple[slice | Sequence[int], Sequence[int] | slice] | tuple[slice, slice], ) -> PandasLikeSeries | PandasLikeDataFrame: + if isinstance(item, tuple): + item = tuple(list(i) if is_sequence_but_not_str(i) else i for i in item) + if isinstance(item, str): from narwhals._pandas_like.series import PandasLikeSeries diff --git a/narwhals/_polars/dataframe.py b/narwhals/_polars/dataframe.py index 6f61853f3..77d8a016b 100644 --- a/narwhals/_polars/dataframe.py +++ b/narwhals/_polars/dataframe.py @@ -91,6 +91,9 @@ def __getitem__(self, item: Any) -> Any: else: # pragma: no cover # TODO(marco): we can delete this branch after Polars==0.20.30 becomes the minimum # Polars version we support + if isinstance(item, tuple): + item = tuple(list(i) if is_sequence_but_not_str(i) else i for i in item) + columns = self.columns if isinstance(item, tuple) and len(item) == 2 and isinstance(item[1], slice): if isinstance(item[1].start, str) or isinstance(item[1].stop, str): diff --git a/tests/frame/slice_test.py b/tests/frame/getitem_test.py similarity index 82% rename from tests/frame/slice_test.py rename to tests/frame/getitem_test.py index a832daa66..894555b3d 100644 --- a/tests/frame/slice_test.py +++ b/tests/frame/getitem_test.py @@ -1,3 +1,5 @@ +from __future__ import annotations + from typing import Any import numpy as np @@ -193,3 +195,58 @@ def test_slice_edge_cases(constructor_eager: Any) -> None: assert df[[], ["a"]].shape == (0, 1) assert df[:, :].shape == (3, 4) assert df[[], []].shape == (0, 0) + + +@pytest.mark.parametrize( + ("row_idx", "col_idx"), + [ + ([0, 2], [0]), + ((0, 2), [0]), + ([0, 2], (0,)), + ((0, 2), (0,)), + ([0, 2], range(1)), + (range(2), [0]), + (range(2), range(1)), + ], +) +def test_get_item_works_with_tuple_and_list_and_range_row_and_col_indexing( + constructor_eager: Any, + row_idx: list[int] | tuple[int] | range, + col_idx: list[int] | tuple[int] | range, +) -> None: + nw_df = nw.from_native(constructor_eager(data), eager_only=True) + nw_df[row_idx, col_idx] + + +@pytest.mark.parametrize( + ("row_idx", "col"), + [ + ([0, 2], slice(1)), + ((0, 2), slice(1)), + (range(2), slice(1)), + ], +) +def test_get_item_works_with_tuple_and_list_and_range_row_indexing_and_slice_col_indexing( + constructor_eager: Any, + row_idx: list[int] | tuple[int] | range, + col: slice, +) -> None: + nw_df = nw.from_native(constructor_eager(data), eager_only=True) + nw_df[row_idx, col] + + +@pytest.mark.parametrize( + ("row_idx", "col"), + [ + ([0, 2], "a"), + ((0, 2), "a"), + (range(2), "a"), + ], +) +def test_get_item_works_with_tuple_and_list_indexing_and_str( + constructor_eager: Any, + row_idx: list[int] | tuple[int] | range, + col: str, +) -> None: + nw_df = nw.from_native(constructor_eager(data), eager_only=True) + nw_df[row_idx, col] From 827d8decc050a65fa13279a3a91082891407b0bf Mon Sep 17 00:00:00 2001 From: Luciano <66913960+lucianosrp@users.noreply.github.com> Date: Sat, 21 Sep 2024 16:27:44 +0200 Subject: [PATCH 033/145] feat: add `str.len_chars` (#1036) --- docs/api-reference/expr_str.md | 1 + docs/api-reference/series_str.md | 1 + narwhals/_arrow/expr.py | 3 ++ narwhals/_arrow/series.py | 7 ++++ narwhals/_dask/expr.py | 5 +++ narwhals/_pandas_like/expr.py | 5 +++ narwhals/_pandas_like/series.py | 5 +++ narwhals/expr.py | 44 +++++++++++++++++++++ narwhals/series.py | 43 ++++++++++++++++++++ tests/expr_and_series/str/len_chars_test.py | 25 
++++++++++++ 10 files changed, 139 insertions(+) create mode 100644 tests/expr_and_series/str/len_chars_test.py diff --git a/docs/api-reference/expr_str.md b/docs/api-reference/expr_str.md index 8cb0dd9ed..15d66ac99 100644 --- a/docs/api-reference/expr_str.md +++ b/docs/api-reference/expr_str.md @@ -7,6 +7,7 @@ - contains - ends_with - head + - len_chars - slice - replace - replace_all diff --git a/docs/api-reference/series_str.md b/docs/api-reference/series_str.md index af657deff..7bbfccb67 100644 --- a/docs/api-reference/series_str.md +++ b/docs/api-reference/series_str.md @@ -7,6 +7,7 @@ - contains - ends_with - head + - len_chars - replace - replace_all - slice diff --git a/narwhals/_arrow/expr.py b/narwhals/_arrow/expr.py index b324be2b1..1aceb576f 100644 --- a/narwhals/_arrow/expr.py +++ b/narwhals/_arrow/expr.py @@ -429,6 +429,9 @@ class ArrowExprStringNamespace: def __init__(self, expr: ArrowExpr) -> None: self._expr = expr + def len_chars(self) -> ArrowExpr: + return reuse_series_namespace_implementation(self._expr, "str", "len_chars") + def replace( self, pattern: str, diff --git a/narwhals/_arrow/series.py b/narwhals/_arrow/series.py index 4f53a3f00..90e800796 100644 --- a/narwhals/_arrow/series.py +++ b/narwhals/_arrow/series.py @@ -942,6 +942,13 @@ class ArrowSeriesStringNamespace: def __init__(self: Self, series: ArrowSeries) -> None: self._arrow_series = series + def len_chars(self) -> ArrowSeries: + import pyarrow.compute as pc # ignore-banned-import() + + return self._arrow_series._from_native_series( + pc.utf8_length(self._arrow_series._native_series) + ) + def replace( self, pattern: str, value: str, *, literal: bool = False, n: int = 1 ) -> ArrowSeries: diff --git a/narwhals/_dask/expr.py b/narwhals/_dask/expr.py index bb5f8ddcb..cd75e87bd 100644 --- a/narwhals/_dask/expr.py +++ b/narwhals/_dask/expr.py @@ -695,6 +695,11 @@ class DaskExprStringNamespace: def __init__(self, expr: DaskExpr) -> None: self._expr = expr + def len_chars(self) -> DaskExpr: + return self._expr._from_call( + lambda _input: _input.str.len(), "len", returns_scalar=False + ) + def replace( self, pattern: str, diff --git a/narwhals/_pandas_like/expr.py b/narwhals/_pandas_like/expr.py index 409d1ab09..06be54394 100644 --- a/narwhals/_pandas_like/expr.py +++ b/narwhals/_pandas_like/expr.py @@ -380,6 +380,11 @@ class PandasLikeExprStringNamespace: def __init__(self, expr: PandasLikeExpr) -> None: self._expr = expr + def len_chars( + self, + ) -> PandasLikeExpr: + return reuse_series_namespace_implementation(self._expr, "str", "len_chars") + def replace( self, pattern: str, diff --git a/narwhals/_pandas_like/series.py b/narwhals/_pandas_like/series.py index 14d7a128e..0092e97c8 100644 --- a/narwhals/_pandas_like/series.py +++ b/narwhals/_pandas_like/series.py @@ -693,6 +693,11 @@ class PandasLikeSeriesStringNamespace: def __init__(self, series: PandasLikeSeries) -> None: self._pandas_series = series + def len_chars(self) -> PandasLikeSeries: + return self._pandas_series._from_native_series( + self._pandas_series._native_series.str.len() + ) + def replace( self, pattern: str, value: str, *, literal: bool = False, n: int = 1 ) -> PandasLikeSeries: diff --git a/narwhals/expr.py b/narwhals/expr.py index b39a59818..b04a471da 100644 --- a/narwhals/expr.py +++ b/narwhals/expr.py @@ -2021,6 +2021,50 @@ class ExprStringNamespace: def __init__(self, expr: Expr) -> None: self._expr = expr + def len_chars(self) -> Expr: + r""" + Return the length of each string as the number of characters. 
+ + Examples: + >>> import pandas as pd + >>> import polars as pl + >>> import narwhals as nw + >>> data = {"words": ["foo", "Café", "345", "東京", None]} + >>> df_pd = pd.DataFrame(data) + >>> df_pl = pl.DataFrame(data) + + We define a dataframe-agnostic function: + + >>> @nw.narwhalify + ... def func(df): + ... return df.with_columns(words_len=nw.col("words").str.len_chars()) + + We can then pass either pandas or Polars to `func`: + + >>> func(df_pd) + words words_len + 0 foo 3.0 + 1 Café 4.0 + 2 345 3.0 + 3 東京 2.0 + 4 None NaN + + >>> func(df_pl) + shape: (5, 2) + ┌───────┬───────────┐ + │ words ┆ words_len │ + │ --- ┆ --- │ + │ str ┆ u32 │ + ╞═══════╪═══════════╡ + │ foo ┆ 3 │ + │ Café ┆ 4 │ + │ 345 ┆ 3 │ + │ 東京 ┆ 2 │ + │ null ┆ null │ + └───────┴───────────┘ + """ + return self._expr.__class__(lambda plx: self._expr._call(plx).str.len_chars()) + def replace( self, pattern: str, value: str, *, literal: bool = False, n: int = 1 ) -> Expr: diff --git a/narwhals/series.py b/narwhals/series.py index f34217c99..b46d3268d 100644 --- a/narwhals/series.py +++ b/narwhals/series.py @@ -2467,6 +2467,49 @@ class SeriesStringNamespace: def __init__(self, series: Series) -> None: self._narwhals_series = series + def len_chars(self) -> Series: + r""" + Return the length of each string as the number of characters. + + Examples: + >>> import pandas as pd + >>> import polars as pl + >>> import narwhals as nw + >>> data = ["foo", "Café", "345", "東京", None] + >>> s_pd = pd.Series(data) + >>> s_pl = pl.Series(data) + + We define a dataframe-agnostic function: + + >>> @nw.narwhalify + ... def func(s): + ... return s.str.len_chars() + + We can then pass either pandas or Polars to `func`: + + >>> func(s_pd) + 0 3.0 + 1 4.0 + 2 3.0 + 3 2.0 + 4 NaN + dtype: float64 + + >>> func(s_pl) # doctest: +NORMALIZE_WHITESPACE + shape: (5,) + Series: '' [u32] + [ + 3 + 4 + 3 + 2 + null + ] + """ + return self._narwhals_series._from_compliant_series( + self._narwhals_series._compliant_series.str.len_chars() + ) + def replace( self, pattern: str, value: str, *, literal: bool = False, n: int = 1 ) -> Series: diff --git a/tests/expr_and_series/str/len_chars_test.py b/tests/expr_and_series/str/len_chars_test.py new file mode 100644 index 000000000..ace145552 --- /dev/null +++ b/tests/expr_and_series/str/len_chars_test.py @@ -0,0 +1,25 @@ +from typing import Any + +import narwhals.stable.v1 as nw +from tests.utils import Constructor +from tests.utils import compare_dicts + +data = {"a": ["foo", "foobar", "Café", "345", "東京"]} + + +def test_str_len_chars(constructor: Constructor) -> None: + df = nw.from_native(constructor(data)) + result = df.select(nw.col("a").str.len_chars()) + expected = { + "a": [3, 6, 4, 3, 2], + } + compare_dicts(result, expected) + + +def test_str_len_chars_series(constructor_eager: Any) -> None: + df = nw.from_native(constructor_eager(data), eager_only=True) + expected = { + "a": [3, 6, 4, 3, 2], + } + result = df.select(df["a"].str.len_chars()) + compare_dicts(result, expected) From 691fead7534650e7cab44ab6bc36839dee4c7ad1 Mon Sep 17 00:00:00 2001 From: Marco Edward Gorelli Date: Sat, 21 Sep 2024 20:58:13 +0200 Subject: [PATCH 034/145] feat: improve Series repr (#1038) --- narwhals/series.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/narwhals/series.py b/narwhals/series.py index b46d3268d..441900a31 100644 --- a/narwhals/series.py +++ b/narwhals/series.py @@ -256,14 +256,14 @@ def pipe(self, function: Callable[[Any], Self], *args: Any, **kwargs: Any) -> Se return function(self, 
*args, **kwargs) def __repr__(self) -> str: # pragma: no cover - header = " Narwhals Series " + header = " Narwhals Series " length = len(header) return ( "┌" + "─" * length + "┐\n" + f"|{header}|\n" - + "| Use `series.to_native()` to see native output |\n" + + "| Use `.to_native()` to see native output |\n" + "└" + "─" * length + "┘" From dc54e7ada2d07f4ea314745b8a7065a9027df83d Mon Sep 17 00:00:00 2001 From: Liam Connors Date: Sat, 21 Sep 2024 15:14:07 -0400 Subject: [PATCH 035/145] xfail test_group_by_n_unique_w_missing for cuDF (#1039) --- tests/test_group_by.py | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/tests/test_group_by.py b/tests/test_group_by.py index fa9c05f4b..666322c7b 100644 --- a/tests/test_group_by.py +++ b/tests/test_group_by.py @@ -123,7 +123,13 @@ def test_group_by_std(constructor: Constructor) -> None: compare_dicts(result, expected) -def test_group_by_n_unique_w_missing(constructor: Constructor) -> None: +def test_group_by_n_unique_w_missing( + constructor: Constructor, request: pytest.FixtureRequest +) -> None: + if "cudf" in str(constructor): + # Issue in cuDF https://github.com/rapidsai/cudf/issues/16861 + request.applymarker(pytest.mark.xfail) + data = {"a": [1, 1, 2], "b": [4, None, 5], "c": [None, None, 7], "d": [1, 1, 3]} result = ( nw.from_native(constructor(data)) From 6ddd7544edccd8b3ffef57a563f955b9d5cee0e0 Mon Sep 17 00:00:00 2001 From: Liam Connors Date: Mon, 23 Sep 2024 02:45:04 -0400 Subject: [PATCH 036/145] fix: add cuDF specific implementation for join `how="anti"` (#1041) * xfail tests that use indicator for cuDF * add cuDF specific return for join anti * Revert "xfail tests that use indicator for cuDF" This reverts commit 8446740cf927c7c4a21824e3dee375aa0fc977f0. * add pragma: no cover for cuDF --- narwhals/_pandas_like/dataframe.py | 30 ++++++++++++++++++++---------- 1 file changed, 20 insertions(+), 10 deletions(-) diff --git a/narwhals/_pandas_like/dataframe.py b/narwhals/_pandas_like/dataframe.py index 446c68a13..cd810b3ee 100644 --- a/narwhals/_pandas_like/dataframe.py +++ b/narwhals/_pandas_like/dataframe.py @@ -485,17 +485,27 @@ def join( ) .drop_duplicates() ) - return self._from_native_frame( - self._native_frame.merge( - other_native, - how="outer", - indicator=indicator_token, - left_on=left_on, - right_on=left_on, + if self._implementation is Implementation.CUDF: # pragma: no cover + return self._from_native_frame( + self._native_frame.merge( + other_native, + how="leftanti", + left_on=left_on, + right_on=left_on, + ) + ) + else: + return self._from_native_frame( + self._native_frame.merge( + other_native, + how="outer", + indicator=indicator_token, + left_on=left_on, + right_on=left_on, + ) + .loc[lambda t: t[indicator_token] == "left_only"] + .drop(columns=indicator_token) ) - .loc[lambda t: t[indicator_token] == "left_only"] - .drop(columns=indicator_token) - ) if how == "semi": other_native = ( From 3f1619da7fc7f8f389e4d00797d029196b9a807b Mon Sep 17 00:00:00 2001 From: FBruzzesi Date: Mon, 23 Sep 2024 14:02:37 +0200 Subject: [PATCH 037/145] Revert "fix: add cuDF specific implementation for join `how="anti"` (#1041)" This reverts commit 6ddd7544edccd8b3ffef57a563f955b9d5cee0e0. 
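Editorial aside, not part of the patch: the behaviour touched by the reverted commit (and by its later follow-up in #1056) is the user-facing anti join, `DataFrame.join(..., how="anti")`. A minimal sketch of that call from the caller's side, assuming pandas-backed frames and the `nw.from_native` helper used throughout the tests in this series; the column names are invented purely for illustration:

    import pandas as pd
    import narwhals as nw

    left = nw.from_native(pd.DataFrame({"id": [1, 2, 3], "x": ["a", "b", "c"]}), eager_only=True)
    right = nw.from_native(pd.DataFrame({"id": [2, 3]}), eager_only=True)
    # keep the rows of `left` whose `id` has no match in `right` -> only id == 1 remains
    anti = left.join(right, how="anti", left_on="id", right_on="id")
    print(anti.to_native())
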
--- narwhals/_pandas_like/dataframe.py | 30 ++++++++++-------------------- 1 file changed, 10 insertions(+), 20 deletions(-) diff --git a/narwhals/_pandas_like/dataframe.py b/narwhals/_pandas_like/dataframe.py index cd810b3ee..446c68a13 100644 --- a/narwhals/_pandas_like/dataframe.py +++ b/narwhals/_pandas_like/dataframe.py @@ -485,27 +485,17 @@ def join( ) .drop_duplicates() ) - if self._implementation is Implementation.CUDF: # pragma: no cover - return self._from_native_frame( - self._native_frame.merge( - other_native, - how="leftanti", - left_on=left_on, - right_on=left_on, - ) - ) - else: - return self._from_native_frame( - self._native_frame.merge( - other_native, - how="outer", - indicator=indicator_token, - left_on=left_on, - right_on=left_on, - ) - .loc[lambda t: t[indicator_token] == "left_only"] - .drop(columns=indicator_token) + return self._from_native_frame( + self._native_frame.merge( + other_native, + how="outer", + indicator=indicator_token, + left_on=left_on, + right_on=left_on, ) + .loc[lambda t: t[indicator_token] == "left_only"] + .drop(columns=indicator_token) + ) if how == "semi": other_native = ( From 5281c4014e94edc99d40db799514c1772c6e1040 Mon Sep 17 00:00:00 2001 From: Francesco Bruzzesi <42817048+FBruzzesi@users.noreply.github.com> Date: Mon, 23 Sep 2024 19:42:43 +0200 Subject: [PATCH 038/145] feat: add `Series.dtype` return annotation (#1049) --- narwhals/_arrow/series.py | 2 +- narwhals/_interchange/series.py | 4 ++-- narwhals/_pandas_like/series.py | 2 +- narwhals/_polars/series.py | 2 +- narwhals/series.py | 5 +++-- 5 files changed, 8 insertions(+), 7 deletions(-) diff --git a/narwhals/_arrow/series.py b/narwhals/_arrow/series.py index 90e800796..09fade682 100644 --- a/narwhals/_arrow/series.py +++ b/narwhals/_arrow/series.py @@ -359,7 +359,7 @@ def alias(self, name: str) -> Self: ) @property - def dtype(self) -> DType: + def dtype(self: Self) -> DType: return translate_dtype(self._native_series.type) def abs(self) -> Self: diff --git a/narwhals/_interchange/series.py b/narwhals/_interchange/series.py index 06a9169df..70f84d12f 100644 --- a/narwhals/_interchange/series.py +++ b/narwhals/_interchange/series.py @@ -7,7 +7,7 @@ from narwhals._interchange.dataframe import map_interchange_dtype_to_narwhals_dtype if TYPE_CHECKING: - from narwhals import dtypes + from narwhals.dtypes import DType class InterchangeSeries: @@ -18,7 +18,7 @@ def __narwhals_series__(self) -> Any: return self @property - def dtype(self) -> dtypes.DType: + def dtype(self) -> DType: return map_interchange_dtype_to_narwhals_dtype(self._native_series.dtype) def __getattr__(self, attr: str) -> NoReturn: diff --git a/narwhals/_pandas_like/series.py b/narwhals/_pandas_like/series.py index 0092e97c8..1b40d69de 100644 --- a/narwhals/_pandas_like/series.py +++ b/narwhals/_pandas_like/series.py @@ -165,7 +165,7 @@ def shape(self) -> tuple[int]: return self._native_series.shape # type: ignore[no-any-return] @property - def dtype(self) -> DType: + def dtype(self: Self) -> DType: return translate_dtype(self._native_series) def scatter(self, indices: int | Sequence[int], values: Any) -> Self: diff --git a/narwhals/_polars/series.py b/narwhals/_polars/series.py index e71520042..582ab75dc 100644 --- a/narwhals/_polars/series.py +++ b/narwhals/_polars/series.py @@ -76,7 +76,7 @@ def name(self) -> str: return self._native_series.name # type: ignore[no-any-return] @property - def dtype(self) -> DType: + def dtype(self: Self) -> DType: return translate_dtype(self._native_series.dtype) @overload diff --git 
a/narwhals/series.py b/narwhals/series.py index 441900a31..619e4b553 100644 --- a/narwhals/series.py +++ b/narwhals/series.py @@ -16,6 +16,7 @@ from typing_extensions import Self from narwhals.dataframe import DataFrame + from narwhals.dtypes import DType class Series: @@ -302,7 +303,7 @@ def len(self) -> int: return len(self._compliant_series) @property - def dtype(self) -> Any: + def dtype(self: Self) -> DType: """ Get the data type of the Series. @@ -327,7 +328,7 @@ def dtype(self) -> Any: >>> func(s_pl) Int64 """ - return self._compliant_series.dtype + return self._compliant_series.dtype # type: ignore[no-any-return] @property def name(self) -> str: From 0e26ef657388f827f71511b160d98a8cf1828d6e Mon Sep 17 00:00:00 2001 From: Francesco Bruzzesi <42817048+FBruzzesi@users.noreply.github.com> Date: Mon, 23 Sep 2024 23:10:18 +0200 Subject: [PATCH 039/145] fix: move `__len__` to `DataFrame` only (#1053) --- narwhals/dataframe.py | 6 +++--- tests/frame/len_test.py | 2 +- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/narwhals/dataframe.py b/narwhals/dataframe.py index 83ebdcdc5..581c8617d 100644 --- a/narwhals/dataframe.py +++ b/narwhals/dataframe.py @@ -41,9 +41,6 @@ class BaseFrame(Generic[FrameT]): _compliant_frame: Any _level: Literal["full", "interchange"] - def __len__(self) -> Any: - return self._compliant_frame.__len__() - def __native_namespace__(self) -> Any: return self._compliant_frame.__native_namespace__() @@ -316,6 +313,9 @@ def __init__( msg = f"Expected an object which implements `__narwhals_dataframe__`, got: {type(df)}" raise AssertionError(msg) + def __len__(self) -> Any: + return self._compliant_frame.__len__() + def __array__(self, dtype: Any = None, copy: bool | None = None) -> np.ndarray: return self._compliant_frame.__array__(dtype, copy=copy) diff --git a/tests/frame/len_test.py b/tests/frame/len_test.py index 6cb9269de..c06884e03 100644 --- a/tests/frame/len_test.py +++ b/tests/frame/len_test.py @@ -9,5 +9,5 @@ def test_len(constructor_eager: Any) -> None: - result = len(nw.from_native(constructor_eager(data))) + result = len(nw.from_native(constructor_eager(data), eager_only=True)) assert result == 4 From b4c31b383c05bc331fe481525e2c76fb1ac5e37b Mon Sep 17 00:00:00 2001 From: Francesco Bruzzesi <42817048+FBruzzesi@users.noreply.github.com> Date: Tue, 24 Sep 2024 09:26:36 +0200 Subject: [PATCH 040/145] feat: add Series.__iter__ (#1057) --- docs/api-reference/series.md | 1 + narwhals/_arrow/series.py | 4 ++++ narwhals/_expression_parsing.py | 8 ++------ narwhals/_pandas_like/series.py | 4 ++++ narwhals/series.py | 4 ++++ tests/series_only/__iter___test.py | 16 ++++++++++++++++ utils/check_api_reference.py | 1 + 7 files changed, 32 insertions(+), 6 deletions(-) create mode 100644 tests/series_only/__iter___test.py diff --git a/docs/api-reference/series.md b/docs/api-reference/series.md index 9868e7b98..5adaa57ab 100644 --- a/docs/api-reference/series.md +++ b/docs/api-reference/series.md @@ -6,6 +6,7 @@ members: - __arrow_c_stream__ - __getitem__ + - __iter__ - abs - alias - all diff --git a/narwhals/_arrow/series.py b/narwhals/_arrow/series.py index 09fade682..8c8643791 100644 --- a/narwhals/_arrow/series.py +++ b/narwhals/_arrow/series.py @@ -3,6 +3,7 @@ from typing import TYPE_CHECKING from typing import Any from typing import Iterable +from typing import Iterator from typing import Literal from typing import Sequence from typing import overload @@ -702,6 +703,9 @@ def mode(self: Self) -> ArrowSeries: plx.col(col_token) == plx.col(col_token).max() 
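# Editorial note, not part of the patch: the `__iter__` methods added in this
# commit make a narwhals Series usable as a plain Python iterable. A rough
# usage sketch, assuming an eager pandas-backed series:
#
#     s = nw.from_native(pd.Series([1, 2, 3]), series_only=True)
#     assert [x * 2 for x in s] == [2, 4, 6]
#
# Each backend simply delegates to its native series' iterator, as the
# `yield from self._native_series.__iter__()` lines added below show.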
)[self.name] + def __iter__(self: Self) -> Iterator[Any]: + yield from self._native_series.__iter__() + @property def shape(self) -> tuple[int]: return (len(self._native_series),) diff --git a/narwhals/_expression_parsing.py b/narwhals/_expression_parsing.py index 10d686ab1..3ccf906ff 100644 --- a/narwhals/_expression_parsing.py +++ b/narwhals/_expression_parsing.py @@ -12,7 +12,6 @@ from typing import overload from narwhals.dependencies import is_numpy_array -from narwhals.utils import flatten if TYPE_CHECKING: from narwhals._arrow.dataframe import ArrowDataFrame @@ -95,7 +94,7 @@ def evaluate_into_exprs( """Evaluate each expr into Series.""" series: ListOfCompliantSeries = [ # type: ignore[assignment] item - for sublist in (evaluate_into_expr(df, into_expr) for into_expr in flatten(exprs)) + for sublist in (evaluate_into_expr(df, into_expr) for into_expr in exprs) for item in sublist ] for name, expr in named_exprs.items(): @@ -157,9 +156,7 @@ def parse_into_exprs( ) -> ListOfCompliantExpr: """Parse each input as an expression (if it's not already one). See `parse_into_expr` for more details.""" - return [ - parse_into_expr(into_expr, namespace=namespace) for into_expr in flatten(exprs) - ] + [ + return [parse_into_expr(into_expr, namespace=namespace) for into_expr in exprs] + [ parse_into_expr(expr, namespace=namespace).alias(name) for name, expr in named_exprs.items() ] @@ -181,7 +178,6 @@ def parse_into_expr( - if it's a string, then convert it to an expression - else, raise """ - if hasattr(into_expr, "__narwhals_expr__"): return into_expr # type: ignore[return-value] if hasattr(into_expr, "__narwhals_series__"): diff --git a/narwhals/_pandas_like/series.py b/narwhals/_pandas_like/series.py index 1b40d69de..873683dd5 100644 --- a/narwhals/_pandas_like/series.py +++ b/narwhals/_pandas_like/series.py @@ -3,6 +3,7 @@ from typing import TYPE_CHECKING from typing import Any from typing import Iterable +from typing import Iterator from typing import Literal from typing import Sequence from typing import overload @@ -665,6 +666,9 @@ def mode(self: Self) -> Self: result.name = native_series.name return self._from_native_series(result) + def __iter__(self: Self) -> Iterator[Any]: + yield from self._native_series.__iter__() + @property def str(self) -> PandasLikeSeriesStringNamespace: return PandasLikeSeriesStringNamespace(self) diff --git a/narwhals/series.py b/narwhals/series.py index 619e4b553..ae78eef76 100644 --- a/narwhals/series.py +++ b/narwhals/series.py @@ -3,6 +3,7 @@ from typing import TYPE_CHECKING from typing import Any from typing import Callable +from typing import Iterator from typing import Literal from typing import Sequence from typing import overload @@ -2407,6 +2408,9 @@ def mode(self: Self) -> Self: """ return self._from_compliant_series(self._compliant_series.mode()) + def __iter__(self: Self) -> Iterator[Any]: + yield from self._compliant_series.__iter__() + @property def str(self) -> SeriesStringNamespace: return SeriesStringNamespace(self) diff --git a/tests/series_only/__iter___test.py b/tests/series_only/__iter___test.py new file mode 100644 index 000000000..d190cd80a --- /dev/null +++ b/tests/series_only/__iter___test.py @@ -0,0 +1,16 @@ +from __future__ import annotations + +from collections.abc import Iterable +from typing import Any + +import narwhals.stable.v1 as nw +from tests.utils import compare_dicts + +data = [1, 2, 3] + + +def test_to_list(constructor_eager: Any) -> None: + s = nw.from_native(constructor_eager({"a": data}), eager_only=True)["a"] + + 
assert isinstance(s, Iterable) + compare_dicts({"a": [x for x in s]}, {"a": [1, 2, 3]}) # noqa: C416 diff --git a/utils/check_api_reference.py b/utils/check_api_reference.py index a56b28c58..d11c2bc99 100644 --- a/utils/check_api_reference.py +++ b/utils/check_api_reference.py @@ -28,6 +28,7 @@ "item", "scatter", "to_native", + "__iter__", } BASE_DTYPES = {"NumericType", "DType", "TemporalType"} From 92884de5ecb36a13137ca58adcaa4e7dce035ae2 Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Tue, 24 Sep 2024 09:26:46 +0200 Subject: [PATCH 041/145] [pre-commit.ci] pre-commit autoupdate (#1052) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit updates: - [github.com/astral-sh/ruff-pre-commit: v0.6.5 → v0.6.7](https://github.com/astral-sh/ruff-pre-commit/compare/v0.6.5...v0.6.7) Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com> --- .pre-commit-config.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index e97fce29d..08d274f86 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -1,7 +1,7 @@ repos: - repo: https://github.com/astral-sh/ruff-pre-commit # Ruff version. - rev: 'v0.6.5' + rev: 'v0.6.7' hooks: # Run the formatter. - id: ruff-format From 8d56c0566e227027323ecd21f4dc4fdbc48cdba3 Mon Sep 17 00:00:00 2001 From: Francesco Bruzzesi <42817048+FBruzzesi@users.noreply.github.com> Date: Tue, 24 Sep 2024 09:54:05 +0200 Subject: [PATCH 042/145] release: Bump version to 1.9.0 (#1058) --- docs/installation.md | 2 +- narwhals/__init__.py | 2 +- pyproject.toml | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/docs/installation.md b/docs/installation.md index 2f8012546..0f031b398 100644 --- a/docs/installation.md +++ b/docs/installation.md @@ -13,7 +13,7 @@ Then, if you start the Python REPL and see the following: ```python >>> import narwhals >>> narwhals.__version__ -'1.8.2' +'1.9.0' ``` then installation worked correctly! diff --git a/narwhals/__init__.py b/narwhals/__init__.py index efc5cd8c1..8c63e443e 100644 --- a/narwhals/__init__.py +++ b/narwhals/__init__.py @@ -53,7 +53,7 @@ from narwhals.utils import maybe_get_index from narwhals.utils import maybe_set_index -__version__ = "1.8.2" +__version__ = "1.9.0" __all__ = [ "dependencies", diff --git a/pyproject.toml b/pyproject.toml index f2109e117..26cd3b8fd 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -4,7 +4,7 @@ build-backend = "hatchling.build" [project] name = "narwhals" -version = "1.8.2" +version = "1.9.0" authors = [ { name="Marco Gorelli", email="33491632+MarcoGorelli@users.noreply.github.com" }, ] From 74255272e0fba1238a1503704d5ce9b1a262fc11 Mon Sep 17 00:00:00 2001 From: Francesco Bruzzesi <42817048+FBruzzesi@users.noreply.github.com> Date: Tue, 24 Sep 2024 10:12:33 +0200 Subject: [PATCH 043/145] release: Bump version to 1.9.1 (#1059) --- docs/installation.md | 2 +- narwhals/__init__.py | 2 +- pyproject.toml | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/docs/installation.md b/docs/installation.md index 0f031b398..bdc48abf2 100644 --- a/docs/installation.md +++ b/docs/installation.md @@ -13,7 +13,7 @@ Then, if you start the Python REPL and see the following: ```python >>> import narwhals >>> narwhals.__version__ -'1.9.0' +'1.9.1' ``` then installation worked correctly! 
diff --git a/narwhals/__init__.py b/narwhals/__init__.py index 8c63e443e..a5c9830ff 100644 --- a/narwhals/__init__.py +++ b/narwhals/__init__.py @@ -53,7 +53,7 @@ from narwhals.utils import maybe_get_index from narwhals.utils import maybe_set_index -__version__ = "1.9.0" +__version__ = "1.9.1" __all__ = [ "dependencies", diff --git a/pyproject.toml b/pyproject.toml index 26cd3b8fd..c1d1ccce8 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -4,7 +4,7 @@ build-backend = "hatchling.build" [project] name = "narwhals" -version = "1.9.0" +version = "1.9.1" authors = [ { name="Marco Gorelli", email="33491632+MarcoGorelli@users.noreply.github.com" }, ] From 0a97364e95e6938789b9be3fcda63e5e2562ba0b Mon Sep 17 00:00:00 2001 From: Marco Edward Gorelli Date: Tue, 24 Sep 2024 09:29:57 +0100 Subject: [PATCH 044/145] ci: improve release process so it fails early (#1060) --- docs/installation.md | 2 +- narwhals/__init__.py | 2 +- pyproject.toml | 2 +- utils/bump_version.py | 8 +++++++- 4 files changed, 10 insertions(+), 4 deletions(-) diff --git a/docs/installation.md b/docs/installation.md index bdc48abf2..2f8012546 100644 --- a/docs/installation.md +++ b/docs/installation.md @@ -13,7 +13,7 @@ Then, if you start the Python REPL and see the following: ```python >>> import narwhals >>> narwhals.__version__ -'1.9.1' +'1.8.2' ``` then installation worked correctly! diff --git a/narwhals/__init__.py b/narwhals/__init__.py index a5c9830ff..efc5cd8c1 100644 --- a/narwhals/__init__.py +++ b/narwhals/__init__.py @@ -53,7 +53,7 @@ from narwhals.utils import maybe_get_index from narwhals.utils import maybe_set_index -__version__ = "1.9.1" +__version__ = "1.8.2" __all__ = [ "dependencies", diff --git a/pyproject.toml b/pyproject.toml index c1d1ccce8..f2109e117 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -4,7 +4,7 @@ build-backend = "hatchling.build" [project] name = "narwhals" -version = "1.9.1" +version = "1.8.2" authors = [ { name="Marco Gorelli", email="33491632+MarcoGorelli@users.noreply.github.com" }, ] diff --git a/utils/bump_version.py b/utils/bump_version.py index 1f6869897..f53acf9e4 100644 --- a/utils/bump_version.py +++ b/utils/bump_version.py @@ -4,7 +4,13 @@ import subprocess import sys -subprocess.run(["git", "fetch", "upstream"]) +out = subprocess.run(["git", "fetch", "upstream"]) +if out.returncode != 0: + print( + "Something went wrong with the release process, please check the Narwhals Wiki and try again." + ) + print(out) + sys.exit(1) subprocess.run(["git", "reset", "--hard", "upstream/main"]) how = sys.argv[1] From 476a1d3eb5ff16d5204a30897a9a26d1852a2e67 Mon Sep 17 00:00:00 2001 From: Marco Edward Gorelli Date: Tue, 24 Sep 2024 09:34:04 +0100 Subject: [PATCH 045/145] release: Bump version to 1.8.3 (#1061) --- docs/installation.md | 2 +- narwhals/__init__.py | 2 +- pyproject.toml | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/docs/installation.md b/docs/installation.md index 2f8012546..28a07d5b0 100644 --- a/docs/installation.md +++ b/docs/installation.md @@ -13,7 +13,7 @@ Then, if you start the Python REPL and see the following: ```python >>> import narwhals >>> narwhals.__version__ -'1.8.2' +'1.8.3' ``` then installation worked correctly! 
diff --git a/narwhals/__init__.py b/narwhals/__init__.py index efc5cd8c1..1313f1138 100644 --- a/narwhals/__init__.py +++ b/narwhals/__init__.py @@ -53,7 +53,7 @@ from narwhals.utils import maybe_get_index from narwhals.utils import maybe_set_index -__version__ = "1.8.2" +__version__ = "1.8.3" __all__ = [ "dependencies", diff --git a/pyproject.toml b/pyproject.toml index f2109e117..13128e4f8 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -4,7 +4,7 @@ build-backend = "hatchling.build" [project] name = "narwhals" -version = "1.8.2" +version = "1.8.3" authors = [ { name="Marco Gorelli", email="33491632+MarcoGorelli@users.noreply.github.com" }, ] From d019d6c269bc04d4ba368ee077de77d47c7f5ab6 Mon Sep 17 00:00:00 2001 From: Marco Edward Gorelli Date: Tue, 24 Sep 2024 11:08:29 +0100 Subject: [PATCH 046/145] ci: delete all local tags before making release (#1062) --- utils/bump_version.py | 20 +++++++++++++++++++- 1 file changed, 19 insertions(+), 1 deletion(-) diff --git a/utils/bump_version.py b/utils/bump_version.py index f53acf9e4..4230e6899 100644 --- a/utils/bump_version.py +++ b/utils/bump_version.py @@ -4,7 +4,7 @@ import subprocess import sys -out = subprocess.run(["git", "fetch", "upstream"]) +out = subprocess.run(["git", "fetch", "upstream", "--tags"]) if out.returncode != 0: print( "Something went wrong with the release process, please check the Narwhals Wiki and try again." @@ -13,6 +13,24 @@ sys.exit(1) subprocess.run(["git", "reset", "--hard", "upstream/main"]) +# Delete local tags, if present +try: + # Get the list of all tags + result = subprocess.run( + ["git", "tag", "-l"], capture_output=True, text=True, check=True + ) + tags = result.stdout.splitlines() # Split the tags into a list by lines + + # Delete each tag using git tag -d + for tag in tags: + subprocess.run(["git", "tag", "-d", tag], check=True) + print("All local tags have been deleted.") +except subprocess.CalledProcessError as e: + print(f"An error occurred: {e}") + +subprocess.run(["git", "fetch", "upstream", "--tags"]) +subprocess.run(["git", "fetch", "upstream", "--prune", "--tags"]) + how = sys.argv[1] with open("pyproject.toml", encoding="utf-8") as f: From 2c9e2e7a308ebb30c6f672e27c1da2086ebbecbc Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Dea=20Mar=C3=ADa=20L=C3=A9on?= Date: Wed, 25 Sep 2024 16:06:23 +0200 Subject: [PATCH 047/145] build: add PR labels to ignore bots in release drafter (#1069) --- .github/release-drafter.yml | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/.github/release-drafter.yml b/.github/release-drafter.yml index 8f4c98811..d8928bce6 100644 --- a/.github/release-drafter.yml +++ b/.github/release-drafter.yml @@ -1,6 +1,7 @@ exclude-labels: - - skip changelog + - ignore - release + - dependencies name-template: 'Narwhals v$RESOLVED_VERSION' change-template: '- $TITLE (#$NUMBER)' @@ -34,6 +35,9 @@ autolabeler: - label: release title: - '/^([Rr]elease)/' + - label: ignore + title: + - '/^\[pre-commit.ci\]/' version-resolver: major: From b4aaa4ac87e5df0a3a12afa0eea1ba6f6fe18dfe Mon Sep 17 00:00:00 2001 From: Liam Connors Date: Thu, 26 Sep 2024 03:26:21 -0400 Subject: [PATCH 048/145] fix: add cuDF specific implementation for join how="anti" (#1056) * add cuDF specific return for join anti * correct argument on cuDF implementation * fix right_on param --- narwhals/_pandas_like/dataframe.py | 48 ++++++++++++++++++------------ 1 file changed, 29 insertions(+), 19 deletions(-) diff --git a/narwhals/_pandas_like/dataframe.py b/narwhals/_pandas_like/dataframe.py index 
446c68a13..097429724 100644 --- a/narwhals/_pandas_like/dataframe.py +++ b/narwhals/_pandas_like/dataframe.py @@ -474,28 +474,38 @@ def join( ) if how == "anti": - indicator_token = generate_unique_token( - n_bytes=8, columns=[*self.columns, *other.columns] - ) + if self._implementation is Implementation.CUDF: # pragma: no cover + return self._from_native_frame( + self._native_frame.merge( + other._native_frame, + how="leftanti", + left_on=left_on, + right_on=right_on, + ) + ) + else: + indicator_token = generate_unique_token( + n_bytes=8, columns=[*self.columns, *other.columns] + ) - other_native = ( - other._native_frame.loc[:, right_on] - .rename( # rename to avoid creating extra columns in join - columns=dict(zip(right_on, left_on)) # type: ignore[arg-type] + other_native = ( + other._native_frame.loc[:, right_on] + .rename( # rename to avoid creating extra columns in join + columns=dict(zip(right_on, left_on)) # type: ignore[arg-type] + ) + .drop_duplicates() ) - .drop_duplicates() - ) - return self._from_native_frame( - self._native_frame.merge( - other_native, - how="outer", - indicator=indicator_token, - left_on=left_on, - right_on=left_on, + return self._from_native_frame( + self._native_frame.merge( + other_native, + how="outer", + indicator=indicator_token, + left_on=left_on, + right_on=left_on, + ) + .loc[lambda t: t[indicator_token] == "left_only"] + .drop(columns=indicator_token) ) - .loc[lambda t: t[indicator_token] == "left_only"] - .drop(columns=indicator_token) - ) if how == "semi": other_native = ( From e49dba107b9207146af03adb26999a336e00eb86 Mon Sep 17 00:00:00 2001 From: Cheuk Ting Ho Date: Thu, 26 Sep 2024 11:13:01 +0200 Subject: [PATCH 049/145] feat: Adding nth() for selecting columns (#1044) * adding nth * add more test, fix docstrings * adding nth in __all__ * remove rool names as it is unknown * raise error if polars < 0.20.26 * fix tests * refactor tests * re-refactor tests * add warning in docstring * Update narwhals/expr.py Co-authored-by: Francesco Bruzzesi <42817048+FBruzzesi@users.noreply.github.com> * Update tests/translate/select_test.py Co-authored-by: Francesco Bruzzesi <42817048+FBruzzesi@users.noreply.github.com> * Update tests/expr_and_series/nth_test.py Co-authored-by: Francesco Bruzzesi <42817048+FBruzzesi@users.noreply.github.com> * fix docs * fix stable docs * fix doctest * tighten annotaiton * warning -> notes * doctests * fixup * set 1.0 as minimum polars version --------- Co-authored-by: Francesco Bruzzesi <42817048+FBruzzesi@users.noreply.github.com> Co-authored-by: Marco Gorelli <33491632+MarcoGorelli@users.noreply.github.com> --- docs/api-reference/narwhals.md | 1 + docs/requirements-docs.txt | 1 + narwhals/__init__.py | 2 ++ narwhals/_arrow/expr.py | 25 +++++++++++++ narwhals/_arrow/namespace.py | 7 ++++ narwhals/_dask/expr.py | 21 +++++++++++ narwhals/_dask/namespace.py | 6 ++++ narwhals/_pandas_like/expr.py | 27 ++++++++++++++ narwhals/_pandas_like/namespace.py | 7 ++++ narwhals/_polars/namespace.py | 9 +++++ narwhals/expr.py | 56 ++++++++++++++++++++++++++++++ narwhals/stable/v1.py | 52 +++++++++++++++++++++++++++ tests/expr_and_series/nth_test.py | 48 +++++++++++++++++++++++++ 13 files changed, 262 insertions(+) create mode 100644 tests/expr_and_series/nth_test.py diff --git a/docs/api-reference/narwhals.md b/docs/api-reference/narwhals.md index 9d7ac384e..d678b8732 100644 --- a/docs/api-reference/narwhals.md +++ b/docs/api-reference/narwhals.md @@ -28,6 +28,7 @@ Here are the top-level functions available in Narwhals. 
- min - narwhalify - new_series + - nth - sum - sum_horizontal - when diff --git a/docs/requirements-docs.txt b/docs/requirements-docs.txt index 7cd67b471..929f35790 100644 --- a/docs/requirements-docs.txt +++ b/docs/requirements-docs.txt @@ -1,6 +1,7 @@ jinja2 markdown-exec[ansi] mkdocs +mkdocs-autorefs mkdocs-material mkdocstrings[python] polars>=1.0.0 diff --git a/narwhals/__init__.py b/narwhals/__init__.py index 1313f1138..a5f95cf70 100644 --- a/narwhals/__init__.py +++ b/narwhals/__init__.py @@ -33,6 +33,7 @@ from narwhals.expr import mean from narwhals.expr import mean_horizontal from narwhals.expr import min +from narwhals.expr import nth from narwhals.expr import sum from narwhals.expr import sum_horizontal from narwhals.expr import when @@ -80,6 +81,7 @@ "max", "mean", "mean_horizontal", + "nth", "sum", "sum_horizontal", "when", diff --git a/narwhals/_arrow/expr.py b/narwhals/_arrow/expr.py index 1aceb576f..367dc9b44 100644 --- a/narwhals/_arrow/expr.py +++ b/narwhals/_arrow/expr.py @@ -73,6 +73,31 @@ def func(df: ArrowDataFrame) -> list[ArrowSeries]: backend_version=backend_version, ) + @classmethod + def from_column_indices( + cls: type[Self], *column_indices: int, backend_version: tuple[int, ...] + ) -> Self: + from narwhals._arrow.series import ArrowSeries + + def func(df: ArrowDataFrame) -> list[ArrowSeries]: + return [ + ArrowSeries( + df._native_frame[column_index], + name=df._native_frame.column_names[column_index], + backend_version=df._backend_version, + ) + for column_index in column_indices + ] + + return cls( + func, + depth=0, + function_name="nth", + root_names=None, + output_names=None, + backend_version=backend_version, + ) + def __narwhals_namespace__(self) -> ArrowNamespace: from narwhals._arrow.namespace import ArrowNamespace diff --git a/narwhals/_arrow/namespace.py b/narwhals/_arrow/namespace.py index ed14dd335..f4c3c9eeb 100644 --- a/narwhals/_arrow/namespace.py +++ b/narwhals/_arrow/namespace.py @@ -110,6 +110,13 @@ def col(self, *column_names: str) -> ArrowExpr: *column_names, backend_version=self._backend_version ) + def nth(self, *column_indices: int) -> ArrowExpr: + from narwhals._arrow.expr import ArrowExpr + + return ArrowExpr.from_column_indices( + *column_indices, backend_version=self._backend_version + ) + def len(self) -> ArrowExpr: # coverage bug? 
this is definitely hit return ArrowExpr( # pragma: no cover diff --git a/narwhals/_dask/expr.py b/narwhals/_dask/expr.py index cd75e87bd..e1fb5f87f 100644 --- a/narwhals/_dask/expr.py +++ b/narwhals/_dask/expr.py @@ -71,6 +71,27 @@ def func(df: DaskLazyFrame) -> list[dask_expr.Series]: backend_version=backend_version, ) + @classmethod + def from_column_indices( + cls: type[Self], + *column_indices: int, + backend_version: tuple[int, ...], + ) -> Self: + def func(df: DaskLazyFrame) -> list[dask_expr.Series]: + return [ + df._native_frame.iloc[:, column_index] for column_index in column_indices + ] + + return cls( + func, + depth=0, + function_name="nth", + root_names=None, + output_names=None, + returns_scalar=False, + backend_version=backend_version, + ) + def _from_call( self, # First argument to `call` should be `dask_expr.Series` diff --git a/narwhals/_dask/namespace.py b/narwhals/_dask/namespace.py index 1668ee323..ead7cf109 100644 --- a/narwhals/_dask/namespace.py +++ b/narwhals/_dask/namespace.py @@ -71,6 +71,12 @@ def col(self, *column_names: str) -> DaskExpr: backend_version=self._backend_version, ) + def nth(self, *column_indices: int) -> DaskExpr: + return DaskExpr.from_column_indices( + *column_indices, + backend_version=self._backend_version, + ) + def lit(self, value: Any, dtype: dtypes.DType | None) -> DaskExpr: def convert_if_dtype( series: dask_expr.Series, dtype: DType | type[DType] diff --git a/narwhals/_pandas_like/expr.py b/narwhals/_pandas_like/expr.py index 06be54394..512699515 100644 --- a/narwhals/_pandas_like/expr.py +++ b/narwhals/_pandas_like/expr.py @@ -81,6 +81,33 @@ def func(df: PandasLikeDataFrame) -> list[PandasLikeSeries]: backend_version=backend_version, ) + @classmethod + def from_column_indices( + cls: type[Self], + *column_indices: int, + implementation: Implementation, + backend_version: tuple[int, ...], + ) -> Self: + def func(df: PandasLikeDataFrame) -> list[PandasLikeSeries]: + return [ + PandasLikeSeries( + df._native_frame.iloc[:, column_index], + implementation=df._implementation, + backend_version=df._backend_version, + ) + for column_index in column_indices + ] + + return cls( + func, + depth=0, + function_name="nth", + root_names=None, + output_names=None, + implementation=implementation, + backend_version=backend_version, + ) + def cast( self, dtype: Any, diff --git a/narwhals/_pandas_like/namespace.py b/narwhals/_pandas_like/namespace.py index 37c889549..804c0e8a1 100644 --- a/narwhals/_pandas_like/namespace.py +++ b/narwhals/_pandas_like/namespace.py @@ -112,6 +112,13 @@ def col(self, *column_names: str) -> PandasLikeExpr: backend_version=self._backend_version, ) + def nth(self, *column_indices: int) -> PandasLikeExpr: + return PandasLikeExpr.from_column_indices( + *column_indices, + implementation=self._implementation, + backend_version=self._backend_version, + ) + def all(self) -> PandasLikeExpr: return PandasLikeExpr( lambda df: [ diff --git a/narwhals/_polars/namespace.py b/narwhals/_polars/namespace.py index 48ee8ebc0..be9cf7a8c 100644 --- a/narwhals/_polars/namespace.py +++ b/narwhals/_polars/namespace.py @@ -56,6 +56,15 @@ def func(*args: Any, **kwargs: Any) -> Any: return func + def nth(self, *indices: int) -> PolarsExpr: + from narwhals._polars.expr import PolarsExpr + + pl = get_polars() + if self._backend_version < (1, 0, 0): # pragma: no cover + msg = "`nth` is only supported for Polars>=1.0.0. Please use `col` for columns selection instead." 
+ raise AttributeError(msg) + return PolarsExpr(pl.nth(*indices)) + def len(self) -> PolarsExpr: from narwhals._polars.expr import PolarsExpr diff --git a/narwhals/expr.py b/narwhals/expr.py index b04a471da..8ff626d52 100644 --- a/narwhals/expr.py +++ b/narwhals/expr.py @@ -5,6 +5,7 @@ from typing import Callable from typing import Iterable from typing import Literal +from typing import Sequence from narwhals.dependencies import is_numpy_array from narwhals.utils import flatten @@ -3741,6 +3742,61 @@ def func(plx: Any) -> Any: return Expr(func) +def nth(*indices: int | Sequence[int]) -> Expr: + """ + Creates an expression that references one or more columns by their index(es). + + Notes: + `nth` is not supported for Polars version<1.0.0. Please use [`col`](/api-reference/narwhals/#narwhals.col) instead. + + Arguments: + indices: One or more indices representing the columns to retrieve. + + Examples: + >>> import pandas as pd + >>> import polars as pl + >>> import pyarrow as pa + >>> import narwhals as nw + >>> data = {"a": [1, 2], "b": [3, 4]} + >>> df_pl = pl.DataFrame(data) + >>> df_pd = pd.DataFrame(data) + >>> df_pa = pa.table(data) + + We define a dataframe-agnostic function: + + >>> @nw.narwhalify + ... def func(df): + ... return df.select(nw.nth(0) * 2) + + We can then pass either pandas or polars to `func`: + + >>> func(df_pd) + a + 0 2 + 1 4 + >>> func(df_pl) # doctest: +SKIP + shape: (2, 1) + ┌─────┐ + │ a │ + │ --- │ + │ i64 │ + ╞═════╡ + │ 2 │ + │ 4 │ + └─────┘ + >>> func(df_pa) + pyarrow.Table + a: int64 + ---- + a: [[2,4]] + """ + + def func(plx: Any) -> Any: + return plx.nth(*flatten(indices)) + + return Expr(func) + + # Add underscore so it doesn't conflict with builtin `all` def all_() -> Expr: """ diff --git a/narwhals/stable/v1.py b/narwhals/stable/v1.py index 0f34c88fb..cc0a42bed 100644 --- a/narwhals/stable/v1.py +++ b/narwhals/stable/v1.py @@ -940,6 +940,57 @@ def col(*names: str | Iterable[str]) -> Expr: return _stableify(nw.col(*names)) +def nth(*indices: int | Sequence[int]) -> Expr: + """ + Creates an expression that references one or more columns by their index(es). + + Notes: + `nth` is not supported for Polars version<1.0.0. Please use [`col`](/api-reference/narwhals/#narwhals.col) instead. + + Arguments: + indices: One or more indices representing the columns to retrieve. + + Examples: + >>> import pandas as pd + >>> import polars as pl + >>> import pyarrow as pa + >>> import narwhals.stable.v1 as nw + >>> data = {"a": [1, 2], "b": [3, 4]} + >>> df_pl = pl.DataFrame(data) + >>> df_pd = pd.DataFrame(data) + >>> df_pa = pa.table(data) + + We define a dataframe-agnostic function: + + >>> @nw.narwhalify + ... def func(df): + ... return df.select(nw.nth(0) * 2) + + We can then pass either pandas or polars to `func`: + + >>> func(df_pd) + a + 0 2 + 1 4 + >>> func(df_pl) # doctest: +SKIP + shape: (2, 1) + ┌─────┐ + │ a │ + │ --- │ + │ i64 │ + ╞═════╡ + │ 2 │ + │ 4 │ + └─────┘ + >>> func(df_pa) + pyarrow.Table + a: int64 + ---- + a: [[2,4]] + """ + return _stableify(nw.nth(*indices)) + + def len() -> Expr: """ Return the number of rows. 
@@ -1887,6 +1938,7 @@ def from_dict( "all_horizontal", "any_horizontal", "col", + "nth", "len", "lit", "min", diff --git a/tests/expr_and_series/nth_test.py b/tests/expr_and_series/nth_test.py new file mode 100644 index 000000000..00a8b5c9d --- /dev/null +++ b/tests/expr_and_series/nth_test.py @@ -0,0 +1,48 @@ +from __future__ import annotations + +from typing import Any + +import polars as pl +import pytest + +import narwhals.stable.v1 as nw +from narwhals.utils import parse_version +from tests.utils import Constructor +from tests.utils import compare_dicts + +data = {"a": [1, 3, 2], "b": [4, 4, 6], "z": [7.1, 8, 9]} + + +@pytest.mark.parametrize( + ("idx", "expected"), + [ + (0, {"a": [1, 3, 2]}), + ([0, 1], {"a": [1, 3, 2], "b": [4, 4, 6]}), + ([0, 2], {"a": [1, 3, 2], "z": [7.1, 8, 9]}), + ], +) +def test_nth( + constructor: Constructor, + idx: int | list[int], + expected: dict[str, list[int]], + request: Any, +) -> None: + if "polars" in str(constructor) and parse_version(pl.__version__) < parse_version( + "1.0.0" + ): + request.applymarker(pytest.mark.xfail) + df = nw.from_native(constructor(data)) + result = df.select(nw.nth(idx)) + compare_dicts(result, expected) + + +@pytest.mark.skipif( + parse_version(pl.__version__) >= parse_version("1.0.0"), + reason="1.0.0", +) +def test_nth_not_supported() -> None: # pragma: no cover + df = nw.from_native(pl.DataFrame(data)) + with pytest.raises( + AttributeError, match="`nth` is only supported for Polars>=1.0.0." + ): + df.select(nw.nth(0)) From a8cb962df80df97b2837aaf21db1fe3e827fc119 Mon Sep 17 00:00:00 2001 From: raisadz <34237447+raisadz@users.noreply.github.com> Date: Thu, 26 Sep 2024 21:57:59 +0100 Subject: [PATCH 050/145] feat: add `series.rename` (#1073) * add series rename method * sort api reference * add alias and rename test * Update narwhals/series.py Co-authored-by: Francesco Bruzzesi <42817048+FBruzzesi@users.noreply.github.com> * add pyarrow series to examples --------- Co-authored-by: Francesco Bruzzesi <42817048+FBruzzesi@users.noreply.github.com> --- docs/api-reference/series.md | 1 + narwhals/series.py | 65 +++++++++++++++++++++++++- tests/series_only/alias_rename_test.py | 13 ++++++ utils/check_api_reference.py | 21 +++++---- 4 files changed, 89 insertions(+), 11 deletions(-) create mode 100644 tests/series_only/alias_rename_test.py diff --git a/docs/api-reference/series.md b/docs/api-reference/series.md index 5adaa57ab..e8572dda8 100644 --- a/docs/api-reference/series.md +++ b/docs/api-reference/series.md @@ -43,6 +43,7 @@ - null_count - pipe - quantile + - rename - round - sample - scatter diff --git a/narwhals/series.py b/narwhals/series.py index ae78eef76..ab8b012b8 100644 --- a/narwhals/series.py +++ b/narwhals/series.py @@ -1187,10 +1187,12 @@ def alias(self, name: str) -> Self: Examples: >>> import pandas as pd >>> import polars as pl + >>> import pyarrow as pa >>> import narwhals as nw >>> s = [1, 2, 3] >>> s_pd = pd.Series(s, name="foo") >>> s_pl = pl.Series("foo", s) + >>> s_pa = pa.chunked_array([s]) We define a library agnostic function: @@ -1198,7 +1200,7 @@ def alias(self, name: str) -> Self: ... def func(s): ... 
return s.alias("bar") - We can then pass either pandas or Polars to `func`: + We can then pass any supported library such as pandas, Polars, or PyArrow: >>> func(s_pd) 0 1 @@ -1213,9 +1215,70 @@ def alias(self, name: str) -> Self: 2 3 ] + >>> func(s_pa) # doctest: +ELLIPSIS + + [ + [ + 1, + 2, + 3 + ] + ] """ return self._from_compliant_series(self._compliant_series.alias(name=name)) + def rename(self, name: str) -> Self: + """ + Rename the Series. + + Alias for `Series.alias()`. + + Arguments: + name: The new name. + + Examples: + >>> import pandas as pd + >>> import polars as pl + >>> import pyarrow as pa + >>> import narwhals as nw + >>> s = [1, 2, 3] + >>> s_pd = pd.Series(s, name="foo") + >>> s_pl = pl.Series("foo", s) + >>> s_pa = pa.chunked_array([s]) + + We define a library agnostic function: + + >>> @nw.narwhalify + ... def func(s): + ... return s.rename("bar") + + We can then pass any supported library such as pandas, Polars, or PyArrow: + + >>> func(s_pd) + 0 1 + 1 2 + 2 3 + Name: bar, dtype: int64 + >>> func(s_pl) # doctest: +NORMALIZE_WHITESPACE + shape: (3,) + Series: 'bar' [i64] + [ + 1 + 2 + 3 + ] + >>> func(s_pa) # doctest: +ELLIPSIS + + [ + [ + 1, + 2, + 3 + ] + ] + """ + return self.alias(name=name) + def sort(self, *, descending: bool = False, nulls_last: bool = False) -> Self: """ Sort this Series. Place null values first. diff --git a/tests/series_only/alias_rename_test.py b/tests/series_only/alias_rename_test.py new file mode 100644 index 000000000..4fa8a9993 --- /dev/null +++ b/tests/series_only/alias_rename_test.py @@ -0,0 +1,13 @@ +import narwhals as nw +from tests.utils import Constructor +from tests.utils import compare_dicts + + +def test_alias_rename(constructor_eager: Constructor) -> None: + data = [1, 2, 3] + expected = {"bar": data} + series = nw.from_native(constructor_eager({"foo": data}), eager_only=True)["foo"] + result = series.alias("bar").to_frame() + compare_dicts(result, expected) + result = series.rename("bar").to_frame() + compare_dicts(result, expected) diff --git a/utils/check_api_reference.py b/utils/check_api_reference.py index d11c2bc99..410956e04 100644 --- a/utils/check_api_reference.py +++ b/utils/check_api_reference.py @@ -12,22 +12,23 @@ NAMESPACES = {"dt", "str", "cat", "name"} EXPR_ONLY_METHODS = {"over"} SERIES_ONLY_METHODS = { - "to_arrow", - "to_dummies", - "to_pandas", - "to_list", - "to_numpy", "dtype", - "name", - "shape", - "to_frame", "is_empty", "is_sorted", - "value_counts", - "zip_with", "item", + "name", + "rename", "scatter", + "shape", + "to_arrow", + "to_dummies", + "to_frame", + "to_list", "to_native", + "to_numpy", + "to_pandas", + "value_counts", + "zip_with", "__iter__", } BASE_DTYPES = {"NumericType", "DType", "TemporalType"} From 480306e50b4e3c60ce36df097cd18320fba97a91 Mon Sep 17 00:00:00 2001 From: Francesco Bruzzesi <42817048+FBruzzesi@users.noreply.github.com> Date: Fri, 27 Sep 2024 11:13:06 +0200 Subject: [PATCH 051/145] fix: pandas and dask group by `observed=True` (#1079) * fix: pandas and dask group by observed=True * xfail old pyarrow * xfail old pyarrow --- narwhals/_dask/group_by.py | 1 + narwhals/_pandas_like/group_by.py | 2 ++ tests/test_group_by.py | 24 ++++++++++++++++++++++++ 3 files changed, 27 insertions(+) diff --git a/narwhals/_dask/group_by.py b/narwhals/_dask/group_by.py index d5fbaaf94..d79c95d7b 100644 --- a/narwhals/_dask/group_by.py +++ b/narwhals/_dask/group_by.py @@ -47,6 +47,7 @@ def __init__(self, df: DaskLazyFrame, keys: list[str]) -> None: self._grouped = 
self._df._native_frame.groupby( list(self._keys), dropna=False, + observed=True, ) def agg( diff --git a/narwhals/_pandas_like/group_by.py b/narwhals/_pandas_like/group_by.py index a0ee03f7d..55c038f9d 100644 --- a/narwhals/_pandas_like/group_by.py +++ b/narwhals/_pandas_like/group_by.py @@ -40,6 +40,7 @@ def __init__(self, df: PandasLikeDataFrame, keys: list[str]) -> None: list(self._keys), sort=False, as_index=True, + observed=True, ) else: self._grouped = self._df._native_frame.groupby( @@ -47,6 +48,7 @@ def __init__(self, df: PandasLikeDataFrame, keys: list[str]) -> None: sort=False, as_index=True, dropna=False, + observed=True, ) def agg( diff --git a/tests/test_group_by.py b/tests/test_group_by.py index 666322c7b..90cb48c26 100644 --- a/tests/test_group_by.py +++ b/tests/test_group_by.py @@ -260,3 +260,27 @@ def test_no_agg(constructor: Constructor) -> None: expected = {"a": [1, 3], "b": [4, 6]} compare_dicts(result, expected) + + +def test_group_by_categorical( + constructor: Constructor, request: pytest.FixtureRequest +) -> None: + if "pyarrow_table" in str(constructor) and parse_version(pa.__version__) < ( + 15, + 0, + 0, + ): # pragma: no cover + request.applymarker(pytest.mark.xfail) + + data = {"g1": ["a", "a", "b", "b"], "g2": ["x", "y", "x", "z"], "x": [1, 2, 3, 4]} + df = nw.from_native(constructor(data)) + result = ( + df.with_columns( + g1=nw.col("g1").cast(nw.Categorical()), + g2=nw.col("g2").cast(nw.Categorical()), + ) + .group_by(["g1", "g2"]) + .agg(nw.col("x").sum()) + .sort("x") + ) + compare_dicts(result, data) From 00ab2c331411b772589ad70192927776141819fd Mon Sep 17 00:00:00 2001 From: Cheuk Ting Ho Date: Fri, 27 Sep 2024 11:32:40 +0200 Subject: [PATCH 052/145] fix: fixing `nw.all()` doesn't work in sum_horizontal (#1075) --------- Co-authored-by: Marco Gorelli <33491632+MarcoGorelli@users.noreply.github.com> --- narwhals/_arrow/namespace.py | 50 ++++++++++++++-- narwhals/_dask/namespace.py | 62 +++++++++++++++++--- narwhals/_expression_parsing.py | 13 ++++ narwhals/_pandas_like/namespace.py | 50 ++++++++++++++-- narwhals/expr.py | 12 ++++ tests/expr_and_series/all_horizontal_test.py | 56 ++++++++++++++++++ tests/expr_and_series/any_horizontal_test.py | 14 +++++ tests/expr_and_series/sum_horizontal_test.py | 15 +++++ 8 files changed, 254 insertions(+), 18 deletions(-) diff --git a/narwhals/_arrow/namespace.py b/narwhals/_arrow/namespace.py index f4c3c9eeb..e7cbbae2e 100644 --- a/narwhals/_arrow/namespace.py +++ b/narwhals/_arrow/namespace.py @@ -13,6 +13,7 @@ from narwhals._arrow.series import ArrowSeries from narwhals._arrow.utils import horizontal_concat from narwhals._arrow.utils import vertical_concat +from narwhals._expression_parsing import combine_root_names from narwhals._expression_parsing import parse_into_exprs from narwhals.utils import Implementation @@ -175,15 +176,54 @@ def _lit_arrow_series(_: ArrowDataFrame) -> ArrowSeries: ) def all_horizontal(self, *exprs: IntoArrowExpr) -> ArrowExpr: - return reduce(lambda x, y: x & y, parse_into_exprs(*exprs, namespace=self)) + parsed_exprs = parse_into_exprs(*exprs, namespace=self) + + def func(df: ArrowDataFrame) -> list[ArrowSeries]: + series = [] + for _expr in parsed_exprs: + series.extend(list(_expr._call(df))) + return [reduce(lambda x, y: x & y, series)] + + return self._create_expr_from_callable( + func=func, + depth=max(x._depth for x in parsed_exprs) + 1, + function_name="all_horizontal", + root_names=combine_root_names(parsed_exprs), + output_names=parsed_exprs[0]._output_names, + ) def 
any_horizontal(self, *exprs: IntoArrowExpr) -> ArrowExpr: - return reduce(lambda x, y: x | y, parse_into_exprs(*exprs, namespace=self)) + parsed_exprs = parse_into_exprs(*exprs, namespace=self) + + def func(df: ArrowDataFrame) -> list[ArrowSeries]: + series = [] + for _expr in parsed_exprs: + series.extend(list(_expr._call(df))) + return [reduce(lambda x, y: x | y, series)] + + return self._create_expr_from_callable( + func=func, + depth=max(x._depth for x in parsed_exprs) + 1, + function_name="any_horizontal", + root_names=combine_root_names(parsed_exprs), + output_names=parsed_exprs[0]._output_names, + ) def sum_horizontal(self, *exprs: IntoArrowExpr) -> ArrowExpr: - return reduce( - lambda x, y: x + y, - [expr.fill_null(0) for expr in parse_into_exprs(*exprs, namespace=self)], + parsed_exprs = parse_into_exprs(*exprs, namespace=self) + + def func(df: ArrowDataFrame) -> list[ArrowSeries]: + series = [] + for _expr in parsed_exprs: + series.extend([_series.fill_null(0) for _series in _expr._call(df)]) + return [reduce(lambda x, y: x + y, series)] + + return self._create_expr_from_callable( + func=func, + depth=max(x._depth for x in parsed_exprs) + 1, + function_name="sum_horizontal", + root_names=combine_root_names(parsed_exprs), + output_names=parsed_exprs[0]._output_names, ) def mean_horizontal(self, *exprs: IntoArrowExpr) -> IntoArrowExpr: diff --git a/narwhals/_dask/namespace.py b/narwhals/_dask/namespace.py index ead7cf109..0a1314a52 100644 --- a/narwhals/_dask/namespace.py +++ b/narwhals/_dask/namespace.py @@ -14,6 +14,7 @@ from narwhals._dask.selectors import DaskSelectorNamespace from narwhals._dask.utils import reverse_translate_dtype from narwhals._dask.utils import validate_comparand +from narwhals._expression_parsing import combine_root_names from narwhals._expression_parsing import parse_into_exprs if TYPE_CHECKING: @@ -52,7 +53,7 @@ def __init__(self, *, backend_version: tuple[int, ...]) -> None: self._backend_version = backend_version def all(self) -> DaskExpr: - def func(df: DaskLazyFrame) -> list[Any]: + def func(df: DaskLazyFrame) -> list[dask_expr.Series]: return [df._native_frame.loc[:, column_name] for column_name in df.columns] return DaskExpr( @@ -125,7 +126,7 @@ def len(self) -> DaskExpr: import dask.dataframe as dd # ignore-banned-import import pandas as pd # ignore-banned-import - def func(df: DaskLazyFrame) -> list[Any]: + def func(df: DaskLazyFrame) -> list[dask_expr.Series]: if not df.columns: return [ dd.from_pandas( @@ -147,15 +148,60 @@ def func(df: DaskLazyFrame) -> list[Any]: ) def all_horizontal(self, *exprs: IntoDaskExpr) -> DaskExpr: - return reduce(lambda x, y: x & y, parse_into_exprs(*exprs, namespace=self)) + parsed_exprs = parse_into_exprs(*exprs, namespace=self) + + def func(df: DaskLazyFrame) -> list[dask_expr.Series]: + series = [] + for _expr in parsed_exprs: + series.extend(list(_expr._call(df))) + return [reduce(lambda x, y: x & y, series).rename(series[0].name)] + + return DaskExpr( + call=func, + depth=max(x._depth for x in parsed_exprs) + 1, + function_name="all_horizontal", + root_names=combine_root_names(parsed_exprs), + output_names=parsed_exprs[0]._output_names, + returns_scalar=False, + backend_version=self._backend_version, + ) def any_horizontal(self, *exprs: IntoDaskExpr) -> DaskExpr: - return reduce(lambda x, y: x | y, parse_into_exprs(*exprs, namespace=self)) + parsed_exprs = parse_into_exprs(*exprs, namespace=self) + + def func(df: DaskLazyFrame) -> list[dask_expr.Series]: + series = [] + for _expr in parsed_exprs: + 
series.extend(list(_expr._call(df))) + return [reduce(lambda x, y: x | y, series).rename(series[0].name)] + + return DaskExpr( + call=func, + depth=max(x._depth for x in parsed_exprs) + 1, + function_name="any_horizontal", + root_names=combine_root_names(parsed_exprs), + output_names=parsed_exprs[0]._output_names, + returns_scalar=False, + backend_version=self._backend_version, + ) def sum_horizontal(self, *exprs: IntoDaskExpr) -> DaskExpr: - return reduce( - lambda x, y: x + y, - [expr.fill_null(0) for expr in parse_into_exprs(*exprs, namespace=self)], + parsed_exprs = parse_into_exprs(*exprs, namespace=self) + + def func(df: DaskLazyFrame) -> list[dask_expr.Series]: + series = [] + for _expr in parsed_exprs: + series.extend([_series.fillna(0) for _series in _expr._call(df)]) + return [reduce(lambda x, y: x + y, series).rename(series[0].name)] + + return DaskExpr( + call=func, + depth=max(x._depth for x in parsed_exprs) + 1, + function_name="sum_horizontal", + root_names=combine_root_names(parsed_exprs), + output_names=parsed_exprs[0]._output_names, + returns_scalar=False, + backend_version=self._backend_version, ) def concat( @@ -263,7 +309,7 @@ def __init__( self._otherwise_value = otherwise_value self._returns_scalar = returns_scalar - def __call__(self, df: DaskLazyFrame) -> list[Any]: + def __call__(self, df: DaskLazyFrame) -> list[dask_expr.Series]: from narwhals._dask.namespace import DaskNamespace from narwhals._expression_parsing import parse_into_expr diff --git a/narwhals/_expression_parsing.py b/narwhals/_expression_parsing.py index 3ccf906ff..35d0f7ec1 100644 --- a/narwhals/_expression_parsing.py +++ b/narwhals/_expression_parsing.py @@ -6,6 +6,7 @@ from copy import copy from typing import TYPE_CHECKING from typing import Any +from typing import Sequence from typing import TypeVar from typing import Union from typing import cast @@ -306,3 +307,15 @@ def is_simple_aggregation(expr: CompliantExpr) -> bool: because then, we can use a fastpath in pandas. 
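    (Illustrative aside, not part of the original docstring: under the depth
    convention checked here, `nw.col("a").sum()` is parsed with depth 1 and
    qualifies for the fastpath, while `(nw.col("a") + 1).sum()` reaches
    depth 2 and does not.)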
""" return expr._depth < 2 + + +def combine_root_names(parsed_exprs: Sequence[CompliantExpr]) -> list[str] | None: + root_names = copy(parsed_exprs[0]._root_names) + for arg in parsed_exprs[1:]: + if root_names is not None and hasattr(arg, "__narwhals_expr__"): + if arg._root_names is not None: + root_names.extend(arg._root_names) + else: + root_names = None + break + return root_names diff --git a/narwhals/_pandas_like/namespace.py b/narwhals/_pandas_like/namespace.py index 804c0e8a1..f58b07a49 100644 --- a/narwhals/_pandas_like/namespace.py +++ b/narwhals/_pandas_like/namespace.py @@ -8,6 +8,7 @@ from typing import cast from narwhals import dtypes +from narwhals._expression_parsing import combine_root_names from narwhals._expression_parsing import parse_into_exprs from narwhals._pandas_like.dataframe import PandasLikeDataFrame from narwhals._pandas_like.expr import PandasLikeExpr @@ -210,16 +211,55 @@ def len(self) -> PandasLikeExpr: # --- horizontal --- def sum_horizontal(self, *exprs: IntoPandasLikeExpr) -> PandasLikeExpr: - return reduce( - lambda x, y: x + y, - [expr.fill_null(0) for expr in parse_into_exprs(*exprs, namespace=self)], + parsed_exprs = parse_into_exprs(*exprs, namespace=self) + + def func(df: PandasLikeDataFrame) -> list[PandasLikeSeries]: + series = [] + for _expr in parsed_exprs: + series.extend([_series.fill_null(0) for _series in _expr._call(df)]) + return [reduce(lambda x, y: x + y, series)] + + return self._create_expr_from_callable( + func=func, + depth=max(x._depth for x in parsed_exprs) + 1, + function_name="sum_horizontal", + root_names=combine_root_names(parsed_exprs), + output_names=parsed_exprs[0]._output_names, ) def all_horizontal(self, *exprs: IntoPandasLikeExpr) -> PandasLikeExpr: - return reduce(lambda x, y: x & y, parse_into_exprs(*exprs, namespace=self)) + parsed_exprs = parse_into_exprs(*exprs, namespace=self) + + def func(df: PandasLikeDataFrame) -> list[PandasLikeSeries]: + series = [] + for _expr in parsed_exprs: + series.extend(list(_expr._call(df))) + return [reduce(lambda x, y: x & y, series)] + + return self._create_expr_from_callable( + func=func, + depth=max(x._depth for x in parsed_exprs) + 1, + function_name="all_horizontal", + root_names=combine_root_names(parsed_exprs), + output_names=parsed_exprs[0]._output_names, + ) def any_horizontal(self, *exprs: IntoPandasLikeExpr) -> PandasLikeExpr: - return reduce(lambda x, y: x | y, parse_into_exprs(*exprs, namespace=self)) + parsed_exprs = parse_into_exprs(*exprs, namespace=self) + + def func(df: PandasLikeDataFrame) -> list[PandasLikeSeries]: + series = [] + for _expr in parsed_exprs: + series.extend(list(_expr._call(df))) + return [reduce(lambda x, y: x | y, series)] + + return self._create_expr_from_callable( + func=func, + depth=max(x._depth for x in parsed_exprs) + 1, + function_name="any_horizontal", + root_names=combine_root_names(parsed_exprs), + output_names=parsed_exprs[0]._output_names, + ) def mean_horizontal(self, *exprs: IntoPandasLikeExpr) -> PandasLikeExpr: pandas_like_exprs = parse_into_exprs(*exprs, namespace=self) diff --git a/narwhals/expr.py b/narwhals/expr.py index 8ff626d52..9f5e2d571 100644 --- a/narwhals/expr.py +++ b/narwhals/expr.py @@ -4087,6 +4087,9 @@ def sum_horizontal(*exprs: IntoExpr | Iterable[IntoExpr]) -> Expr: │ 3 │ └─────┘ """ + if not exprs: + msg = "At least one expression must be passed to `sum_horizontal`" + raise ValueError(msg) return Expr( lambda plx: plx.sum_horizontal( *[extract_compliant(plx, v) for v in flatten(exprs)] @@ -4218,6 +4221,9 @@ def 
all_horizontal(*exprs: IntoExpr | Iterable[IntoExpr]) -> Expr: │ null ┆ null ┆ null │ └───────┴───────┴───────┘ """ + if not exprs: + msg = "At least one expression must be passed to `all_horizontal`" + raise ValueError(msg) return Expr( lambda plx: plx.all_horizontal( *[extract_compliant(plx, v) for v in flatten(exprs)] @@ -4331,6 +4337,9 @@ def any_horizontal(*exprs: IntoExpr | Iterable[IntoExpr]) -> Expr: │ null ┆ null ┆ null │ └───────┴───────┴───────┘ """ + if not exprs: + msg = "At least one expression must be passed to `any_horizontal`" + raise ValueError(msg) return Expr( lambda plx: plx.any_horizontal( *[extract_compliant(plx, v) for v in flatten(exprs)] @@ -4384,6 +4393,9 @@ def mean_horizontal(*exprs: IntoExpr | Iterable[IntoExpr]) -> Expr: │ 3.0 │ └─────┘ """ + if not exprs: + msg = "At least one expression must be passed to `mean_horizontal`" + raise ValueError(msg) return Expr( lambda plx: plx.mean_horizontal( *[extract_compliant(plx, v) for v in flatten(exprs)] diff --git a/tests/expr_and_series/all_horizontal_test.py b/tests/expr_and_series/all_horizontal_test.py index bc9f80358..98dc9f9f9 100644 --- a/tests/expr_and_series/all_horizontal_test.py +++ b/tests/expr_and_series/all_horizontal_test.py @@ -1,8 +1,10 @@ from typing import Any +import polars as pl import pytest import narwhals.stable.v1 as nw +from narwhals.utils import parse_version from tests.utils import Constructor from tests.utils import compare_dicts @@ -31,3 +33,57 @@ def test_allh_series(constructor_eager: Any) -> None: expected = {"all": [False, False, True]} compare_dicts(result, expected) + + +def test_allh_all(constructor: Constructor) -> None: + data = { + "a": [False, False, True], + "b": [False, True, True], + } + df = nw.from_native(constructor(data)) + result = df.select(all=nw.all_horizontal(nw.all())) + expected = {"all": [False, False, True]} + compare_dicts(result, expected) + result = df.select(nw.all_horizontal(nw.all())) + expected = {"a": [False, False, True]} + compare_dicts(result, expected) + + +def test_allh_nth(constructor: Constructor, request: pytest.FixtureRequest) -> None: + if "polars" in str(constructor) and parse_version(pl.__version__) < (1, 0): + request.applymarker(pytest.mark.xfail) + data = { + "a": [False, False, True], + "b": [False, True, True], + } + df = nw.from_native(constructor(data)) + result = df.select(nw.all_horizontal(nw.nth(0, 1))) + expected = {"a": [False, False, True]} + compare_dicts(result, expected) + result = df.select(nw.all_horizontal(nw.col("a"), nw.nth(0))) + expected = {"a": [False, False, True]} + compare_dicts(result, expected) + + +def test_horizontal_expressions_empty(constructor: Constructor) -> None: + data = { + "a": [False, False, True], + "b": [False, True, True], + } + df = nw.from_native(constructor(data)) + with pytest.raises( + ValueError, match=r"At least one expression must be passed.*all_horizontal" + ): + df.select(nw.all_horizontal()) + with pytest.raises( + ValueError, match=r"At least one expression must be passed.*any_horizontal" + ): + df.select(nw.any_horizontal()) + with pytest.raises( + ValueError, match=r"At least one expression must be passed.*mean_horizontal" + ): + df.select(nw.mean_horizontal()) + with pytest.raises( + ValueError, match=r"At least one expression must be passed.*sum_horizontal" + ): + df.select(nw.sum_horizontal()) diff --git a/tests/expr_and_series/any_horizontal_test.py b/tests/expr_and_series/any_horizontal_test.py index 1b6dfd48d..cd360bf66 100644 --- a/tests/expr_and_series/any_horizontal_test.py 
+++ b/tests/expr_and_series/any_horizontal_test.py @@ -19,3 +19,17 @@ def test_anyh(constructor: Constructor, expr1: Any, expr2: Any) -> None: expected = {"any": [False, True, True]} compare_dicts(result, expected) + + +def test_anyh_all(constructor: Constructor) -> None: + data = { + "a": [False, False, True], + "b": [False, True, True], + } + df = nw.from_native(constructor(data)) + result = df.select(any=nw.any_horizontal(nw.all())) + expected = {"any": [False, True, True]} + compare_dicts(result, expected) + result = df.select(nw.any_horizontal(nw.all())) + expected = {"a": [False, True, True]} + compare_dicts(result, expected) diff --git a/tests/expr_and_series/sum_horizontal_test.py b/tests/expr_and_series/sum_horizontal_test.py index bfaab7238..e9e1e4a3c 100644 --- a/tests/expr_and_series/sum_horizontal_test.py +++ b/tests/expr_and_series/sum_horizontal_test.py @@ -28,3 +28,18 @@ def test_sumh_nullable(constructor: Constructor) -> None: df = nw.from_native(constructor(data)) result = df.select(hsum=nw.sum_horizontal("a", "b")) compare_dicts(result, expected) + + +def test_sumh_all(constructor: Constructor) -> None: + data = {"a": [1, 2, 3], "b": [10, 20, 30]} + df = nw.from_native(constructor(data)) + result = df.select(nw.sum_horizontal(nw.all())) + expected = { + "a": [11, 22, 33], + } + compare_dicts(result, expected) + result = df.select(c=nw.sum_horizontal(nw.all())) + expected = { + "c": [11, 22, 33], + } + compare_dicts(result, expected) From bbabe445a9f028a032269349b6949232d559b9eb Mon Sep 17 00:00:00 2001 From: Francesco Bruzzesi <42817048+FBruzzesi@users.noreply.github.com> Date: Fri, 27 Sep 2024 11:50:35 +0200 Subject: [PATCH 053/145] chore: `get_` cleanup (#1074) --- narwhals/_arrow/dataframe.py | 11 ++- narwhals/_arrow/namespace.py | 3 +- narwhals/_arrow/series.py | 15 ++-- narwhals/_dask/dataframe.py | 16 ++-- narwhals/_dask/expr.py | 5 +- narwhals/_dask/namespace.py | 3 +- narwhals/_pandas_like/dataframe.py | 20 ++--- narwhals/_pandas_like/namespace.py | 3 +- narwhals/_pandas_like/series.py | 20 ++--- narwhals/_pandas_like/utils.py | 118 +++++++++++++---------------- narwhals/_polars/dataframe.py | 28 +++++-- narwhals/_polars/namespace.py | 46 ++++++----- narwhals/_polars/series.py | 24 +++--- narwhals/_polars/utils.py | 7 +- narwhals/dataframe.py | 5 +- narwhals/dependencies.py | 10 +-- narwhals/series.py | 6 +- narwhals/utils.py | 3 +- 18 files changed, 193 insertions(+), 150 deletions(-) diff --git a/narwhals/_arrow/dataframe.py b/narwhals/_arrow/dataframe.py index 89919c763..ddcc6d9e0 100644 --- a/narwhals/_arrow/dataframe.py +++ b/narwhals/_arrow/dataframe.py @@ -14,7 +14,6 @@ from narwhals._arrow.utils import translate_dtype from narwhals._arrow.utils import validate_dataframe_comparand from narwhals._expression_parsing import evaluate_into_exprs -from narwhals.dependencies import get_pyarrow from narwhals.dependencies import is_numpy_array from narwhals.utils import Implementation from narwhals.utils import flatten @@ -23,6 +22,8 @@ from narwhals.utils import parse_columns_to_drop if TYPE_CHECKING: + from types import ModuleType + import numpy as np import pyarrow as pa from typing_extensions import Self @@ -48,8 +49,12 @@ def __narwhals_namespace__(self) -> ArrowNamespace: return ArrowNamespace(backend_version=self._backend_version) - def __native_namespace__(self) -> Any: - return get_pyarrow() + def __native_namespace__(self: Self) -> ModuleType: + if self._implementation is Implementation.PYARROW: + return self._implementation.to_native_namespace() + + msg 
= f"Expected pyarrow, got: {type(self._implementation)}" # pragma: no cover + raise AssertionError(msg) def __narwhals_dataframe__(self) -> Self: return self diff --git a/narwhals/_arrow/namespace.py b/narwhals/_arrow/namespace.py index e7cbbae2e..5a608f12e 100644 --- a/narwhals/_arrow/namespace.py +++ b/narwhals/_arrow/namespace.py @@ -4,6 +4,7 @@ from typing import TYPE_CHECKING from typing import Any from typing import Iterable +from typing import Literal from typing import cast from narwhals import dtypes @@ -239,7 +240,7 @@ def concat( self, items: Iterable[ArrowDataFrame], *, - how: str = "vertical", + how: Literal["horizontal", "vertical"], ) -> ArrowDataFrame: dfs: list[Any] = [item._native_frame for item in items] diff --git a/narwhals/_arrow/series.py b/narwhals/_arrow/series.py index 8c8643791..f3888bc53 100644 --- a/narwhals/_arrow/series.py +++ b/narwhals/_arrow/series.py @@ -13,12 +13,12 @@ from narwhals._arrow.utils import narwhals_to_native_dtype from narwhals._arrow.utils import translate_dtype from narwhals._arrow.utils import validate_column_comparand -from narwhals.dependencies import get_pandas -from narwhals.dependencies import get_pyarrow from narwhals.utils import Implementation from narwhals.utils import generate_unique_token if TYPE_CHECKING: + from types import ModuleType + import pyarrow as pa from typing_extensions import Self @@ -303,8 +303,12 @@ def n_unique(self) -> int: unique_values = pc.unique(self._native_series) return pc.count(unique_values, mode="all") # type: ignore[no-any-return] - def __native_namespace__(self) -> Any: # pragma: no cover - return get_pyarrow() + def __native_namespace__(self: Self) -> ModuleType: + if self._implementation is Implementation.PYARROW: + return self._implementation.to_native_namespace() + + msg = f"Expected pyarrow, got: {type(self._implementation)}" # pragma: no cover + raise AssertionError(msg) @property def name(self) -> str: @@ -573,7 +577,8 @@ def to_frame(self: Self) -> ArrowDataFrame: return ArrowDataFrame(df, backend_version=self._backend_version) def to_pandas(self: Self) -> Any: - pd = get_pandas() + import pandas as pd # ignore-banned-import() + return pd.Series(self._native_series, name=self.name) def is_duplicated(self: Self) -> ArrowSeries: diff --git a/narwhals/_dask/dataframe.py b/narwhals/_dask/dataframe.py index ac10ac2b8..a07db4cec 100644 --- a/narwhals/_dask/dataframe.py +++ b/narwhals/_dask/dataframe.py @@ -9,8 +9,6 @@ from narwhals._dask.utils import add_row_index from narwhals._dask.utils import parse_exprs_and_named_exprs from narwhals._pandas_like.utils import translate_dtype -from narwhals.dependencies import get_dask_dataframe -from narwhals.dependencies import get_pandas from narwhals.utils import Implementation from narwhals.utils import flatten from narwhals.utils import generate_unique_token @@ -18,6 +16,8 @@ from narwhals.utils import parse_version if TYPE_CHECKING: + from types import ModuleType + import dask.dataframe as dd from typing_extensions import Self @@ -36,8 +36,12 @@ def __init__( self._backend_version = backend_version self._implementation = Implementation.DASK - def __native_namespace__(self) -> Any: # pragma: no cover - return get_dask_dataframe() + def __native_namespace__(self: Self) -> ModuleType: + if self._implementation is Implementation.DASK: + return self._implementation.to_native_namespace() + + msg = f"Expected dask, got: {type(self._implementation)}" # pragma: no cover + raise AssertionError(msg) def __narwhals_namespace__(self) -> DaskNamespace: from 
narwhals._dask.namespace import DaskNamespace @@ -57,13 +61,15 @@ def with_columns(self, *exprs: DaskExpr, **named_exprs: DaskExpr) -> Self: return self._from_native_frame(df) def collect(self) -> Any: + import pandas as pd # ignore-banned-import() + from narwhals._pandas_like.dataframe import PandasLikeDataFrame result = self._native_frame.compute() return PandasLikeDataFrame( result, implementation=Implementation.PANDAS, - backend_version=parse_version(get_pandas().__version__), + backend_version=parse_version(pd.__version__), ) @property diff --git a/narwhals/_dask/expr.py b/narwhals/_dask/expr.py index e1fb5f87f..a6eb17566 100644 --- a/narwhals/_dask/expr.py +++ b/narwhals/_dask/expr.py @@ -10,7 +10,6 @@ from narwhals._dask.utils import add_row_index from narwhals._dask.utils import maybe_evaluate from narwhals._dask.utils import reverse_translate_dtype -from narwhals.dependencies import get_dask from narwhals.utils import generate_unique_token if TYPE_CHECKING: @@ -803,8 +802,10 @@ def slice(self, offset: int, length: int | None = None) -> DaskExpr: ) def to_datetime(self, format: str | None = None) -> DaskExpr: # noqa: A002 + import dask.dataframe as dd # ignore-banned-import() + return self._expr._from_call( - lambda _input, fmt: get_dask().dataframe.to_datetime(_input, format=fmt), + lambda _input, fmt: dd.to_datetime(_input, format=fmt), "to_datetime", format, returns_scalar=False, diff --git a/narwhals/_dask/namespace.py b/narwhals/_dask/namespace.py index 0a1314a52..a683b39b1 100644 --- a/narwhals/_dask/namespace.py +++ b/narwhals/_dask/namespace.py @@ -5,6 +5,7 @@ from typing import Any from typing import Callable from typing import Iterable +from typing import Literal from typing import NoReturn from typing import cast @@ -208,7 +209,7 @@ def concat( self, items: Iterable[DaskLazyFrame], *, - how: str = "vertical", + how: Literal["horizontal", "vertical"], ) -> DaskLazyFrame: import dask.dataframe as dd # ignore-banned-import diff --git a/narwhals/_pandas_like/dataframe.py b/narwhals/_pandas_like/dataframe.py index 097429724..eb33b4fdf 100644 --- a/narwhals/_pandas_like/dataframe.py +++ b/narwhals/_pandas_like/dataframe.py @@ -16,9 +16,6 @@ from narwhals._pandas_like.utils import horizontal_concat from narwhals._pandas_like.utils import translate_dtype from narwhals._pandas_like.utils import validate_dataframe_comparand -from narwhals.dependencies import get_cudf -from narwhals.dependencies import get_modin -from narwhals.dependencies import get_pandas from narwhals.dependencies import is_numpy_array from narwhals.utils import Implementation from narwhals.utils import flatten @@ -27,6 +24,8 @@ from narwhals.utils import parse_columns_to_drop if TYPE_CHECKING: + from types import ModuleType + import numpy as np import pandas as pd from typing_extensions import Self @@ -63,13 +62,14 @@ def __narwhals_namespace__(self) -> PandasLikeNamespace: return PandasLikeNamespace(self._implementation, self._backend_version) - def __native_namespace__(self) -> Any: - if self._implementation is Implementation.PANDAS: - return get_pandas() - if self._implementation is Implementation.MODIN: # pragma: no cover - return get_modin() - if self._implementation is Implementation.CUDF: # pragma: no cover - return get_cudf() + def __native_namespace__(self: Self) -> ModuleType: + if self._implementation in { + Implementation.PANDAS, + Implementation.MODIN, + Implementation.CUDF, + }: + return self._implementation.to_native_namespace() + msg = f"Expected pandas/modin/cudf, got: 
{type(self._implementation)}" # pragma: no cover raise AssertionError(msg) diff --git a/narwhals/_pandas_like/namespace.py b/narwhals/_pandas_like/namespace.py index f58b07a49..b60d0dcce 100644 --- a/narwhals/_pandas_like/namespace.py +++ b/narwhals/_pandas_like/namespace.py @@ -5,6 +5,7 @@ from typing import Any from typing import Callable from typing import Iterable +from typing import Literal from typing import cast from narwhals import dtypes @@ -273,7 +274,7 @@ def concat( self, items: Iterable[PandasLikeDataFrame], *, - how: str = "vertical", + how: Literal["horizontal", "vertical"], ) -> PandasLikeDataFrame: dfs: list[Any] = [item._native_frame for item in items] if how == "horizontal": diff --git a/narwhals/_pandas_like/series.py b/narwhals/_pandas_like/series.py index 873683dd5..d531b4641 100644 --- a/narwhals/_pandas_like/series.py +++ b/narwhals/_pandas_like/series.py @@ -15,12 +15,11 @@ from narwhals._pandas_like.utils import to_datetime from narwhals._pandas_like.utils import translate_dtype from narwhals._pandas_like.utils import validate_column_comparand -from narwhals.dependencies import get_cudf -from narwhals.dependencies import get_modin -from narwhals.dependencies import get_pandas from narwhals.utils import Implementation if TYPE_CHECKING: + from types import ModuleType + from typing_extensions import Self from narwhals._pandas_like.dataframe import PandasLikeDataFrame @@ -97,13 +96,14 @@ def __init__( else: self._use_copy_false = False - def __native_namespace__(self) -> Any: - if self._implementation is Implementation.PANDAS: - return get_pandas() - if self._implementation is Implementation.MODIN: # pragma: no cover - return get_modin() - if self._implementation is Implementation.CUDF: # pragma: no cover - return get_cudf() + def __native_namespace__(self: Self) -> ModuleType: + if self._implementation in { + Implementation.PANDAS, + Implementation.MODIN, + Implementation.CUDF, + }: + return self._implementation.to_native_namespace() + msg = f"Expected pandas/modin/cudf, got: {type(self._implementation)}" # pragma: no cover raise AssertionError(msg) diff --git a/narwhals/_pandas_like/utils.py b/narwhals/_pandas_like/utils.py index a65e361a3..aadb438e2 100644 --- a/narwhals/_pandas_like/utils.py +++ b/narwhals/_pandas_like/utils.py @@ -5,9 +5,6 @@ from typing import Iterable from typing import TypeVar -from narwhals.dependencies import get_cudf -from narwhals.dependencies import get_modin -from narwhals.dependencies import get_pandas from narwhals.utils import Implementation from narwhals.utils import isinstance_or_issubclass @@ -22,6 +19,13 @@ import pandas as pd +PANDAS_LIKE_IMPLEMENTATION = { + Implementation.PANDAS, + Implementation.CUDF, + Implementation.MODIN, +} + + def validate_column_comparand(index: Any, other: Any) -> Any: """Validate RHS of binary operation. 
@@ -93,18 +97,16 @@ def create_native_series( ) -> PandasLikeSeries: from narwhals._pandas_like.series import PandasLikeSeries - if implementation is Implementation.PANDAS: - pd = get_pandas() - series = pd.Series(iterable, index=index, name="") - elif implementation is Implementation.MODIN: - mpd = get_modin() - series = mpd.Series(iterable, index=index, name="") - elif implementation is Implementation.CUDF: - cudf = get_cudf() - series = cudf.Series(iterable, index=index, name="") - return PandasLikeSeries( - series, implementation=implementation, backend_version=backend_version - ) + if implementation in PANDAS_LIKE_IMPLEMENTATION: + series = implementation.to_native_namespace().Series( + iterable, index=index, name="" + ) + return PandasLikeSeries( + series, implementation=implementation, backend_version=backend_version + ) + else: # pragma: no cover + msg = f"Expected pandas-like implementation ({PANDAS_LIKE_IMPLEMENTATION}), found {implementation}" + raise TypeError(msg) def horizontal_concat( @@ -115,22 +117,17 @@ def horizontal_concat( Should be in namespace. """ - if implementation is Implementation.PANDAS: - pd = get_pandas() - - if backend_version < (3,): - return pd.concat(dfs, axis=1, copy=False) - return pd.concat(dfs, axis=1) # pragma: no cover - if implementation is Implementation.CUDF: # pragma: no cover - cudf = get_cudf() - - return cudf.concat(dfs, axis=1) - if implementation is Implementation.MODIN: # pragma: no cover - mpd = get_modin() + if implementation in PANDAS_LIKE_IMPLEMENTATION: + extra_kwargs = ( + {"copy": False} + if implementation is Implementation.PANDAS and backend_version < (3,) + else {} + ) + return implementation.to_native_namespace().concat(dfs, axis=1, **extra_kwargs) - return mpd.concat(dfs, axis=1) - msg = f"Unknown implementation: {implementation}" # pragma: no cover - raise TypeError(msg) # pragma: no cover + else: # pragma: no cover + msg = f"Expected pandas-like implementation ({PANDAS_LIKE_IMPLEMENTATION}), found {implementation}" + raise TypeError(msg) def vertical_concat( @@ -150,22 +147,18 @@ def vertical_concat( if cols_current != cols: msg = "unable to vstack, column names don't match" raise TypeError(msg) - if implementation is Implementation.PANDAS: - pd = get_pandas() - - if backend_version < (3,): - return pd.concat(dfs, axis=0, copy=False) - return pd.concat(dfs, axis=0) # pragma: no cover - if implementation is Implementation.CUDF: # pragma: no cover - cudf = get_cudf() - return cudf.concat(dfs, axis=0) - if implementation is Implementation.MODIN: # pragma: no cover - mpd = get_modin() + if implementation in PANDAS_LIKE_IMPLEMENTATION: + extra_kwargs = ( + {"copy": False} + if implementation is Implementation.PANDAS and backend_version < (3,) + else {} + ) + return implementation.to_native_namespace().concat(dfs, axis=0, **extra_kwargs) - return mpd.concat(dfs, axis=0) - msg = f"Unknown implementation: {implementation}" # pragma: no cover - raise TypeError(msg) # pragma: no cover + else: # pragma: no cover + msg = f"Expected pandas-like implementation ({PANDAS_LIKE_IMPLEMENTATION}), found {implementation}" + raise TypeError(msg) def native_series_from_iterable( @@ -175,20 +168,15 @@ def native_series_from_iterable( implementation: Implementation, ) -> Any: """Return native series.""" - if implementation is Implementation.PANDAS: - pd = get_pandas() - - return pd.Series(data, name=name, index=index, copy=False) - if implementation is Implementation.CUDF: # pragma: no cover - cudf = get_cudf() - - return cudf.Series(data, name=name, 
index=index) - if implementation is Implementation.MODIN: # pragma: no cover - mpd = get_modin() + if implementation in PANDAS_LIKE_IMPLEMENTATION: + extra_kwargs = {"copy": False} if implementation is Implementation.PANDAS else {} + return implementation.to_native_namespace().Series( + data, name=name, index=index, **extra_kwargs + ) - return mpd.Series(data, name=name, index=index) - msg = f"Unknown implementation: {implementation}" # pragma: no cover - raise TypeError(msg) # pragma: no cover + else: # pragma: no cover + msg = f"Expected pandas-like implementation ({PANDAS_LIKE_IMPLEMENTATION}), found {implementation}" + raise TypeError(msg) def set_axis( @@ -297,7 +285,8 @@ def translate_dtype(column: Any) -> DType: def get_dtype_backend(dtype: Any, implementation: Implementation) -> str: if implementation is Implementation.PANDAS: - pd = get_pandas() + import pandas as pd # ignore-banned-import() + if hasattr(pd, "ArrowDtype") and isinstance(dtype, pd.ArrowDtype): return "pyarrow-nullable" @@ -474,13 +463,12 @@ def broadcast_series(series: list[PandasLikeSeries]) -> list[Any]: def to_datetime(implementation: Implementation) -> Any: - if implementation is Implementation.PANDAS: - return get_pandas().to_datetime - if implementation is Implementation.MODIN: - return get_modin().to_datetime - if implementation is Implementation.CUDF: - return get_cudf().to_datetime - raise AssertionError + if implementation in PANDAS_LIKE_IMPLEMENTATION: + return implementation.to_native_namespace().to_datetime + + else: # pragma: no cover + msg = f"Expected pandas-like implementation ({PANDAS_LIKE_IMPLEMENTATION}), found {implementation}" + raise TypeError(msg) def int_dtype_mapper(dtype: Any) -> str: diff --git a/narwhals/_polars/dataframe.py b/narwhals/_polars/dataframe.py index 77d8a016b..1947a5aee 100644 --- a/narwhals/_polars/dataframe.py +++ b/narwhals/_polars/dataframe.py @@ -7,12 +7,13 @@ from narwhals._polars.utils import convert_str_slice_to_int_slice from narwhals._polars.utils import extract_args_kwargs from narwhals._polars.utils import translate_dtype -from narwhals.dependencies import get_polars from narwhals.utils import Implementation from narwhals.utils import is_sequence_but_not_str from narwhals.utils import parse_columns_to_drop if TYPE_CHECKING: + from types import ModuleType + import numpy as np from typing_extensions import Self @@ -20,8 +21,8 @@ class PolarsDataFrame: def __init__(self, df: Any, *, backend_version: tuple[int, ...]) -> None: self._native_frame = df - self._implementation = Implementation.POLARS self._backend_version = backend_version + self._implementation = Implementation.POLARS def __repr__(self) -> str: # pragma: no cover return "PolarsDataFrame" @@ -32,14 +33,19 @@ def __narwhals_dataframe__(self) -> Self: def __narwhals_namespace__(self) -> PolarsNamespace: return PolarsNamespace(backend_version=self._backend_version) - def __native_namespace__(self) -> Any: - return get_polars() + def __native_namespace__(self: Self) -> ModuleType: + if self._implementation is Implementation.POLARS: + return self._implementation.to_native_namespace() + + msg = f"Expected polars, got: {type(self._implementation)}" # pragma: no cover + raise AssertionError(msg) def _from_native_frame(self, df: Any) -> Self: return self.__class__(df, backend_version=self._backend_version) def _from_native_object(self, obj: Any) -> Any: - pl = get_polars() + import polars as pl # ignore-banned-import() + if isinstance(obj, pl.Series): from narwhals._polars.series import PolarsSeries @@ -111,7 
+117,8 @@ def __getitem__(self, item: Any) -> Any: ) msg = f"Expected slice of integers or strings, got: {type(item[1])}" # pragma: no cover raise TypeError(msg) # pragma: no cover - pl = get_polars() + import polars as pl # ignore-banned-import() + if ( isinstance(item, tuple) and (len(item) == 2) @@ -191,6 +198,7 @@ class PolarsLazyFrame: def __init__(self, df: Any, *, backend_version: tuple[int, ...]) -> None: self._native_frame = df self._backend_version = backend_version + self._implementation = Implementation.POLARS def __repr__(self) -> str: # pragma: no cover return "PolarsLazyFrame" @@ -201,8 +209,12 @@ def __narwhals_lazyframe__(self) -> Self: def __narwhals_namespace__(self) -> PolarsNamespace: return PolarsNamespace(backend_version=self._backend_version) - def __native_namespace__(self) -> Any: # pragma: no cover - return get_polars() + def __native_namespace__(self: Self) -> ModuleType: + if self._implementation is Implementation.POLARS: + return self._implementation.to_native_namespace() + + msg = f"Expected polars, got: {type(self._implementation)}" # pragma: no cover + raise AssertionError(msg) def _from_native_frame(self, df: Any) -> Self: return self.__class__(df, backend_version=self._backend_version) diff --git a/narwhals/_polars/namespace.py b/narwhals/_polars/namespace.py index be9cf7a8c..f25e8b81f 100644 --- a/narwhals/_polars/namespace.py +++ b/narwhals/_polars/namespace.py @@ -4,13 +4,13 @@ from typing import TYPE_CHECKING from typing import Any from typing import Iterable +from typing import Literal from typing import Sequence from narwhals import dtypes from narwhals._expression_parsing import parse_into_exprs from narwhals._polars.utils import extract_args_kwargs from narwhals._polars.utils import narwhals_to_native_dtype -from narwhals.dependencies import get_polars from narwhals.utils import Implementation if TYPE_CHECKING: @@ -46,9 +46,9 @@ def __init__(self, *, backend_version: tuple[int, ...]) -> None: self._implementation = Implementation.POLARS def __getattr__(self, attr: str) -> Any: - from narwhals._polars.expr import PolarsExpr + import polars as pl # ignore-banned-import - pl = get_polars() + from narwhals._polars.expr import PolarsExpr def func(*args: Any, **kwargs: Any) -> Any: args, kwargs = extract_args_kwargs(args, kwargs) # type: ignore[assignment] @@ -57,18 +57,20 @@ def func(*args: Any, **kwargs: Any) -> Any: return func def nth(self, *indices: int) -> PolarsExpr: + import polars as pl # ignore-banned-import() + from narwhals._polars.expr import PolarsExpr - pl = get_polars() if self._backend_version < (1, 0, 0): # pragma: no cover msg = "`nth` is only supported for Polars>=1.0.0. Please use `col` for columns selection instead." 
raise AttributeError(msg) return PolarsExpr(pl.nth(*indices)) def len(self) -> PolarsExpr: + import polars as pl # ignore-banned-import() + from narwhals._polars.expr import PolarsExpr - pl = get_polars() if self._backend_version < (0, 20, 5): # pragma: no cover return PolarsExpr(pl.count().alias("len")) return PolarsExpr(pl.len()) @@ -77,12 +79,13 @@ def concat( self, items: Sequence[PolarsDataFrame | PolarsLazyFrame], *, - how: str = "vertical", + how: Literal["vertical", "horizontal"], ) -> PolarsDataFrame | PolarsLazyFrame: + import polars as pl # ignore-banned-import() + from narwhals._polars.dataframe import PolarsDataFrame from narwhals._polars.dataframe import PolarsLazyFrame - pl = get_polars() dfs: list[Any] = [item._native_frame for item in items] result = pl.concat(dfs, how=how) if isinstance(result, pl.DataFrame): @@ -90,25 +93,28 @@ def concat( return PolarsLazyFrame(result, backend_version=items[0]._backend_version) def lit(self, value: Any, dtype: dtypes.DType | None = None) -> PolarsExpr: + import polars as pl # ignore-banned-import() + from narwhals._polars.expr import PolarsExpr - pl = get_polars() if dtype is not None: return PolarsExpr(pl.lit(value, dtype=narwhals_to_native_dtype(dtype))) return PolarsExpr(pl.lit(value)) def mean(self, *column_names: str) -> PolarsExpr: + import polars as pl # ignore-banned-import() + from narwhals._polars.expr import PolarsExpr - pl = get_polars() if self._backend_version < (0, 20, 4): # pragma: no cover - return PolarsExpr(pl.mean([*column_names])) + return PolarsExpr(pl.mean([*column_names])) # type: ignore[arg-type] return PolarsExpr(pl.mean(*column_names)) def mean_horizontal(self, *exprs: IntoPolarsExpr) -> PolarsExpr: + import polars as pl # ignore-banned-import() + from narwhals._polars.expr import PolarsExpr - pl = get_polars() polars_exprs = parse_into_exprs(*exprs, namespace=self) if self._backend_version < (0, 20, 8): # pragma: no cover @@ -127,39 +133,45 @@ def selectors(self) -> PolarsSelectors: class PolarsSelectors: def by_dtype(self, dtypes: Iterable[dtypes.DType]) -> PolarsExpr: + import polars as pl # ignore-banned-import() + from narwhals._polars.expr import PolarsExpr - pl = get_polars() return PolarsExpr( pl.selectors.by_dtype([narwhals_to_native_dtype(dtype) for dtype in dtypes]) ) def numeric(self) -> PolarsExpr: + import polars as pl # ignore-banned-import() + from narwhals._polars.expr import PolarsExpr - pl = get_polars() return PolarsExpr(pl.selectors.numeric()) def boolean(self) -> PolarsExpr: + import polars as pl # ignore-banned-import() + from narwhals._polars.expr import PolarsExpr - pl = get_polars() return PolarsExpr(pl.selectors.boolean()) def string(self) -> PolarsExpr: + import polars as pl # ignore-banned-import() + from narwhals._polars.expr import PolarsExpr - pl = get_polars() return PolarsExpr(pl.selectors.string()) def categorical(self) -> PolarsExpr: + import polars as pl # ignore-banned-import() + from narwhals._polars.expr import PolarsExpr - pl = get_polars() return PolarsExpr(pl.selectors.categorical()) def all(self) -> PolarsExpr: + import polars as pl # ignore-banned-import() + from narwhals._polars.expr import PolarsExpr - pl = get_polars() return PolarsExpr(pl.selectors.all()) diff --git a/narwhals/_polars/series.py b/narwhals/_polars/series.py index 582ab75dc..e8fa28195 100644 --- a/narwhals/_polars/series.py +++ b/narwhals/_polars/series.py @@ -7,10 +7,11 @@ from narwhals._polars.utils import extract_args_kwargs from narwhals._polars.utils import extract_native -from 
narwhals.dependencies import get_polars from narwhals.utils import Implementation if TYPE_CHECKING: + from types import ModuleType + import numpy as np from typing_extensions import Self @@ -20,14 +21,12 @@ from narwhals._polars.utils import narwhals_to_native_dtype from narwhals._polars.utils import translate_dtype -PL = get_polars() - class PolarsSeries: def __init__(self, series: Any, *, backend_version: tuple[int, ...]) -> None: self._native_series = series - self._implementation = Implementation.POLARS self._backend_version = backend_version + self._implementation = Implementation.POLARS def __repr__(self) -> str: # pragma: no cover return "PolarsSeries" @@ -35,14 +34,19 @@ def __repr__(self) -> str: # pragma: no cover def __narwhals_series__(self) -> Self: return self - def __native_namespace__(self) -> Any: - return get_polars() + def __native_namespace__(self: Self) -> ModuleType: + if self._implementation is Implementation.POLARS: + return self._implementation.to_native_namespace() + + msg = f"Expected polars, got: {type(self._implementation)}" # pragma: no cover + raise AssertionError(msg) def _from_native_series(self, series: Any) -> Self: return self.__class__(series, backend_version=self._backend_version) def _from_native_object(self, series: Any) -> Any: - pl = get_polars() + import polars as pl # ignore-banned-import() + if isinstance(series, pl.Series): return self._from_native_series(series) if isinstance(series, pl.DataFrame): @@ -187,7 +191,8 @@ def sort(self, *, descending: bool = False, nulls_last: bool = False) -> Self: result = self._native_series.sort(descending=descending) if nulls_last: - pl = get_polars() + import polars as pl # ignore-banned-import() + is_null = result.is_null() result = pl.concat([result.filter(~is_null), result.filter(is_null)]) else: @@ -208,7 +213,8 @@ def value_counts( from narwhals._polars.dataframe import PolarsDataFrame if self._backend_version < (1, 0, 0): # pragma: no cover - pl = get_polars() + import polars as pl # ignore-banned-import() + value_name_ = name or ("proportion" if normalize else "count") result = self._native_series.value_counts(sort=sort, parallel=parallel) diff --git a/narwhals/_polars/utils.py b/narwhals/_polars/utils.py index 46f399a85..e6fb36859 100644 --- a/narwhals/_polars/utils.py +++ b/narwhals/_polars/utils.py @@ -3,7 +3,6 @@ from typing import Any from narwhals import dtypes -from narwhals.dependencies import get_polars def extract_native(obj: Any) -> Any: @@ -28,7 +27,8 @@ def extract_args_kwargs(args: Any, kwargs: Any) -> tuple[list[Any], dict[str, An def translate_dtype(dtype: Any) -> dtypes.DType: - pl = get_polars() + import polars as pl # ignore-banned-import() + if dtype == pl.Float64: return dtypes.Float64() if dtype == pl.Float32: @@ -69,7 +69,8 @@ def translate_dtype(dtype: Any) -> dtypes.DType: def narwhals_to_native_dtype(dtype: dtypes.DType | type[dtypes.DType]) -> Any: - pl = get_polars() + import polars as pl # ignore-banned-import() + from narwhals import dtypes if dtype == dtypes.Float64: diff --git a/narwhals/dataframe.py b/narwhals/dataframe.py index 581c8617d..3cdd972e2 100644 --- a/narwhals/dataframe.py +++ b/narwhals/dataframe.py @@ -22,6 +22,7 @@ if TYPE_CHECKING: from io import BytesIO from pathlib import Path + from types import ModuleType import numpy as np import pandas as pd @@ -41,8 +42,8 @@ class BaseFrame(Generic[FrameT]): _compliant_frame: Any _level: Literal["full", "interchange"] - def __native_namespace__(self) -> Any: - return self._compliant_frame.__native_namespace__() + 
def __native_namespace__(self: Self) -> ModuleType: + return self._compliant_frame.__native_namespace__() # type: ignore[no-any-return] def __narwhals_namespace__(self) -> Any: return self._compliant_frame.__narwhals_namespace__() diff --git a/narwhals/dependencies.py b/narwhals/dependencies.py index 2cd9f0983..144c57c8a 100644 --- a/narwhals/dependencies.py +++ b/narwhals/dependencies.py @@ -93,22 +93,22 @@ def is_pandas_series(ser: Any) -> TypeGuard[pd.Series[Any]]: def is_modin_dataframe(df: Any) -> TypeGuard[mpd.DataFrame]: """Check whether `df` is a modin DataFrame without importing modin.""" - return (pd := get_modin()) is not None and isinstance(df, pd.DataFrame) + return (mpd := get_modin()) is not None and isinstance(df, mpd.DataFrame) def is_modin_series(ser: Any) -> TypeGuard[mpd.Series]: """Check whether `ser` is a modin Series without importing modin.""" - return (pd := get_modin()) is not None and isinstance(ser, pd.Series) + return (mpd := get_modin()) is not None and isinstance(ser, mpd.Series) def is_cudf_dataframe(df: Any) -> TypeGuard[cudf.DataFrame]: """Check whether `df` is a cudf DataFrame without importing cudf.""" - return (pd := get_cudf()) is not None and isinstance(df, pd.DataFrame) + return (cudf := get_cudf()) is not None and isinstance(df, cudf.DataFrame) -def is_cudf_series(ser: Any) -> TypeGuard[pd.Series[Any]]: +def is_cudf_series(ser: Any) -> TypeGuard[cudf.Series[Any]]: """Check whether `ser` is a cudf Series without importing cudf.""" - return (pd := get_cudf()) is not None and isinstance(ser, pd.Series) + return (cudf := get_cudf()) is not None and isinstance(ser, cudf.Series) def is_dask_dataframe(df: Any) -> TypeGuard[dd.DataFrame]: diff --git a/narwhals/series.py b/narwhals/series.py index ab8b012b8..bd99fc5c7 100644 --- a/narwhals/series.py +++ b/narwhals/series.py @@ -11,6 +11,8 @@ from narwhals.utils import parse_version if TYPE_CHECKING: + from types import ModuleType + import numpy as np import pandas as pd import pyarrow as pa @@ -58,8 +60,8 @@ def __getitem__(self, idx: int | slice | Sequence[int]) -> Any | Self: return self._compliant_series[idx] return self._from_compliant_series(self._compliant_series[idx]) - def __native_namespace__(self) -> Any: - return self._compliant_series.__native_namespace__() + def __native_namespace__(self: Self) -> ModuleType: + return self._compliant_series.__native_namespace__() # type: ignore[no-any-return] def __arrow_c_stream__(self, requested_schema: object | None = None) -> object: """ diff --git a/narwhals/utils.py b/narwhals/utils.py index 15919a922..0d9503240 100644 --- a/narwhals/utils.py +++ b/narwhals/utils.py @@ -66,7 +66,7 @@ def from_native_namespace( } return mapping.get(native_namespace, Implementation.UNKNOWN) - def to_native_namespace(self: Self) -> ModuleType: # pragma: no cover + def to_native_namespace(self: Self) -> ModuleType: """Return the native namespace module corresponding to Implementation.""" mapping = { Implementation.PANDAS: get_pandas(), @@ -74,6 +74,7 @@ def to_native_namespace(self: Self) -> ModuleType: # pragma: no cover Implementation.CUDF: get_cudf(), Implementation.PYARROW: get_pyarrow(), Implementation.POLARS: get_polars(), + Implementation.DASK: get_dask_dataframe(), } return mapping[self] # type: ignore[no-any-return] From 359060ce56b2c579251a05419d5d53adfaf9e647 Mon Sep 17 00:00:00 2001 From: Francesco Bruzzesi <42817048+FBruzzesi@users.noreply.github.com> Date: Fri, 27 Sep 2024 14:11:46 +0200 Subject: [PATCH 054/145] feat: add `to_pandas` and `to_arrow` for 
interchange level of support (#1066) --- .github/workflows/pytest.yml | 4 +- narwhals/_duckdb/dataframe.py | 17 +++++++++ narwhals/_ibis/dataframe.py | 10 +++++ narwhals/_interchange/dataframe.py | 22 +++++++++++ noxfile.py | 4 +- tests/frame/interchange_to_arrow_test.py | 41 +++++++++++++++++++++ tests/frame/interchange_to_pandas_test.py | 45 +++++++++++++++++++++++ 7 files changed, 140 insertions(+), 3 deletions(-) create mode 100644 tests/frame/interchange_to_arrow_test.py create mode 100644 tests/frame/interchange_to_pandas_test.py diff --git a/.github/workflows/pytest.yml b/.github/workflows/pytest.yml index 179891ddf..bf4f7d39a 100644 --- a/.github/workflows/pytest.yml +++ b/.github/workflows/pytest.yml @@ -25,7 +25,7 @@ jobs: cache-suffix: ${{ matrix.python-version }} cache-dependency-glob: "**requirements*.txt" - name: install-reqs - run: uv pip install --upgrade tox virtualenv setuptools -r requirements-dev.txt ibis-framework[duckdb] --system + run: uv pip install --upgrade tox virtualenv setuptools -r requirements-dev.txt --system - name: show-deps run: uv pip freeze - name: Run pytest @@ -87,7 +87,7 @@ jobs: - name: show-deps run: uv pip freeze - name: install ibis - run: uv pip install ibis-framework[duckdb] --system + run: uv pip install ibis-framework[duckdb]>=6.0.0 --system # Ibis puts upper bounds on dependencies, and requires Python3.10+, # which messes with other dependencies on lower Python versions if: matrix.python-version == '3.12' diff --git a/narwhals/_duckdb/dataframe.py b/narwhals/_duckdb/dataframe.py index 8de244658..2263c3fc7 100644 --- a/narwhals/_duckdb/dataframe.py +++ b/narwhals/_duckdb/dataframe.py @@ -4,8 +4,13 @@ from typing import Any from narwhals import dtypes +from narwhals.utils import parse_version if TYPE_CHECKING: + import pandas as pd + import pyarrow as pa + from typing_extensions import Self + from narwhals._duckdb.series import DuckDBInterchangeSeries @@ -73,3 +78,15 @@ def __getattr__(self, attr: str) -> Any: "at https://github.com/narwhals-dev/narwhals/issues." 
) raise NotImplementedError(msg) # pragma: no cover + + def to_pandas(self: Self) -> pd.DataFrame: + import pandas as pd # ignore-banned-import() + + if parse_version(pd.__version__) >= parse_version("1.0.0"): + return self._native_frame.df() + else: # pragma: no cover + msg = f"Conversion to pandas requires pandas>=1.0.0, found {pd.__version__}" + raise NotImplementedError(msg) + + def to_arrow(self: Self) -> pa.Table: + return self._native_frame.arrow() diff --git a/narwhals/_ibis/dataframe.py b/narwhals/_ibis/dataframe.py index e2baa4ec4..fb7bedbf1 100644 --- a/narwhals/_ibis/dataframe.py +++ b/narwhals/_ibis/dataframe.py @@ -6,6 +6,10 @@ from narwhals import dtypes if TYPE_CHECKING: + import pandas as pd + import pyarrow as pa + from typing_extensions import Self + from narwhals._ibis.series import IbisInterchangeSeries @@ -55,6 +59,12 @@ def __getitem__(self, item: str) -> IbisInterchangeSeries: return IbisInterchangeSeries(self._native_frame[item]) + def to_pandas(self: Self) -> pd.DataFrame: + return self._native_frame.to_pandas() + + def to_arrow(self: Self) -> pa.Table: + return self._native_frame.to_pyarrow() + def __getattr__(self, attr: str) -> Any: if attr == "schema": return { diff --git a/narwhals/_interchange/dataframe.py b/narwhals/_interchange/dataframe.py index 2e9775258..975da216f 100644 --- a/narwhals/_interchange/dataframe.py +++ b/narwhals/_interchange/dataframe.py @@ -6,8 +6,13 @@ from typing import NoReturn from narwhals import dtypes +from narwhals.utils import parse_version if TYPE_CHECKING: + import pandas as pd + import pyarrow as pa + from typing_extensions import Self + from narwhals._interchange.series import InterchangeSeries @@ -89,6 +94,23 @@ def schema(self) -> dict[str, dtypes.DType]: for column_name in self._interchange_frame.column_names() } + def to_pandas(self: Self) -> pd.DataFrame: + import pandas as pd # ignore-banned-import() + + if parse_version(pd.__version__) >= parse_version("1.5.0"): + return pd.api.interchange.from_dataframe(self._native_frame) + else: # pragma: no cover + msg = ( + "Conversion to pandas is achieved via interchange protocol which requires" + f" pandas>=1.5.0 to be installed, found {pd.__version__}" + ) + raise NotImplementedError(msg) + + def to_arrow(self: Self) -> pa.Table: + from pyarrow.interchange import from_dataframe # ignore-banned-import() + + return from_dataframe(self._native_frame) + def __getattr__(self, attr: str) -> NoReturn: msg = ( f"Attribute {attr} is not supported for metadata-only dataframes.\n\n" diff --git a/noxfile.py b/noxfile.py index 06cdc0284..1dc37b29d 100644 --- a/noxfile.py +++ b/noxfile.py @@ -10,6 +10,9 @@ def run_common(session: Session, coverage_threshold: float) -> None: session.install("-e.", "-r", "requirements-dev.txt") + if session.python != "3.8": + session.install("ibis-framework[duckdb]>=6.0.0") + session.run( "pytest", "tests", @@ -73,5 +76,4 @@ def nightly_versions(session: Session) -> None: "git+https://github.com/dask/dask", "git+https://github.com/dask/dask-expr", ) - run_common(session, coverage_threshold=50) diff --git a/tests/frame/interchange_to_arrow_test.py b/tests/frame/interchange_to_arrow_test.py new file mode 100644 index 000000000..7308607ea --- /dev/null +++ b/tests/frame/interchange_to_arrow_test.py @@ -0,0 +1,41 @@ +import duckdb +import polars as pl +import pyarrow as pa +import pytest + +import narwhals.stable.v1 as nw + +data = {"a": [1, 2, 3], "b": [4.0, 5.0, 6.1], "z": ["x", "y", "z"]} + + +def test_interchange_to_arrow() -> None: + df_pl = 
pl.DataFrame(data) + df = nw.from_native(df_pl.__dataframe__(), eager_or_interchange_only=True) + result = df.to_arrow() + + assert isinstance(result, pa.Table) + + +def test_interchange_ibis_to_arrow( + tmpdir: pytest.TempdirFactory, +) -> None: # pragma: no cover + ibis = pytest.importorskip("ibis") + df_pl = pl.DataFrame(data) + + filepath = str(tmpdir / "file.parquet") # type: ignore[operator] + df_pl.write_parquet(filepath) + + tbl = ibis.read_parquet(filepath) + df = nw.from_native(tbl, eager_or_interchange_only=True) + result = df.to_arrow() + + assert isinstance(result, pa.Table) + + +def test_interchange_duckdb_to_arrow() -> None: + df_pl = pl.DataFrame(data) # noqa: F841 + rel = duckdb.sql("select * from df_pl") + df = nw.from_native(rel, eager_or_interchange_only=True) + result = df.to_arrow() + + assert isinstance(result, pa.Table) diff --git a/tests/frame/interchange_to_pandas_test.py b/tests/frame/interchange_to_pandas_test.py new file mode 100644 index 000000000..f56575fa3 --- /dev/null +++ b/tests/frame/interchange_to_pandas_test.py @@ -0,0 +1,45 @@ +import duckdb +import pandas as pd +import pytest + +import narwhals.stable.v1 as nw +from narwhals.utils import parse_version + +data = {"a": [1, 2, 3], "b": [4.0, 5.0, 6.0], "z": ["x", "y", "z"]} + + +def test_interchange_to_pandas(request: pytest.FixtureRequest) -> None: + if parse_version(pd.__version__) < parse_version("1.5.0"): + request.applymarker(pytest.mark.xfail) + df_raw = pd.DataFrame(data) + df = nw.from_native(df_raw.__dataframe__(), eager_or_interchange_only=True) + + assert df.to_pandas().equals(df_raw) + + +def test_interchange_ibis_to_pandas( + tmpdir: pytest.TempdirFactory, request: pytest.FixtureRequest +) -> None: # pragma: no cover + if parse_version(pd.__version__) < parse_version("1.5.0"): + request.applymarker(pytest.mark.xfail) + + ibis = pytest.importorskip("ibis") + df_raw = pd.DataFrame(data) + + filepath = str(tmpdir / "file.parquet") # type: ignore[operator] + df_raw.to_parquet(filepath) + + tbl = ibis.read_parquet(filepath) + df = nw.from_native(tbl, eager_or_interchange_only=True) + + assert df.to_pandas().equals(df_raw) + + +def test_interchange_duckdb_to_pandas(request: pytest.FixtureRequest) -> None: + if parse_version(pd.__version__) < parse_version("1.0.0"): + request.applymarker(pytest.mark.xfail) + df_raw = pd.DataFrame(data) + rel = duckdb.sql("select * from df_raw") + df = nw.from_native(rel, eager_or_interchange_only=True) + + assert df.to_pandas().equals(df_raw) From a9963a69c55a378117f6d829e533ca44eb930655 Mon Sep 17 00:00:00 2001 From: Francesco Bruzzesi <42817048+FBruzzesi@users.noreply.github.com> Date: Fri, 27 Sep 2024 14:55:11 +0200 Subject: [PATCH 055/145] feat: add `DataFrame|LazyFrame.unpivot` method (#1043) --------- Co-authored-by: Marco Gorelli <33491632+MarcoGorelli@users.noreply.github.com> --- docs/api-reference/dataframe.md | 1 + docs/api-reference/lazyframe.md | 1 + narwhals/_arrow/dataframe.py | 42 +++++++ narwhals/_dask/dataframe.py | 16 +++ narwhals/_pandas_like/dataframe.py | 16 +++ narwhals/_polars/dataframe.py | 44 ++++++++ narwhals/dataframe.py | 170 +++++++++++++++++++++++++++++ tests/frame/unpivot_test.py | 67 ++++++++++++ 8 files changed, 357 insertions(+) create mode 100644 tests/frame/unpivot_test.py diff --git a/docs/api-reference/dataframe.md b/docs/api-reference/dataframe.md index f593293f4..447acbc15 100644 --- a/docs/api-reference/dataframe.md +++ b/docs/api-reference/dataframe.md @@ -41,6 +41,7 @@ - to_numpy - to_pandas - unique + - unpivot - 
with_columns - with_row_index - write_csv diff --git a/docs/api-reference/lazyframe.md b/docs/api-reference/lazyframe.md index 89a8282ac..a6776e08c 100644 --- a/docs/api-reference/lazyframe.md +++ b/docs/api-reference/lazyframe.md @@ -25,6 +25,7 @@ - tail - to_native - unique + - unpivot - with_columns - with_row_index show_root_heading: false diff --git a/narwhals/_arrow/dataframe.py b/narwhals/_arrow/dataframe.py index ddcc6d9e0..717238643 100644 --- a/narwhals/_arrow/dataframe.py +++ b/narwhals/_arrow/dataframe.py @@ -620,3 +620,45 @@ def sample( mask = rng.choice(idx, size=n, replace=with_replacement) return self._from_native_frame(pc.take(frame, mask)) + + def unpivot( + self: Self, + on: str | list[str] | None, + index: str | list[str] | None, + variable_name: str | None, + value_name: str | None, + ) -> Self: + import pyarrow as pa # ignore-banned-import + + native_frame = self._native_frame + variable_name = variable_name if variable_name is not None else "variable" + value_name = value_name if value_name is not None else "value" + + index_: list[str] = ( + [] if index is None else [index] if isinstance(index, str) else index + ) + on_: list[str] = ( + [c for c in self.columns if c not in index_] + if on is None + else [on] + if isinstance(on, str) + else on + ) + + n_rows = len(self) + + return self._from_native_frame( + pa.concat_tables( + [ + pa.Table.from_arrays( + [ + *[native_frame.column(idx_col) for idx_col in index_], + pa.array([on_col] * n_rows, pa.string()), + native_frame.column(on_col), + ], + names=[*index_, variable_name, value_name], + ) + for on_col in on_ + ] + ) + ) diff --git a/narwhals/_dask/dataframe.py b/narwhals/_dask/dataframe.py index a07db4cec..fd40e876a 100644 --- a/narwhals/_dask/dataframe.py +++ b/narwhals/_dask/dataframe.py @@ -361,3 +361,19 @@ def gather_every(self: Self, n: int, offset: int) -> Self: ) .drop([row_index_token], strict=False) ) + + def unpivot( + self: Self, + on: str | list[str] | None, + index: str | list[str] | None, + variable_name: str | None, + value_name: str | None, + ) -> Self: + return self._from_native_frame( + self._native_frame.melt( + id_vars=index, + value_vars=on, + var_name=variable_name if variable_name is not None else "variable", + value_name=value_name if value_name is not None else "value", + ) + ) diff --git a/narwhals/_pandas_like/dataframe.py b/narwhals/_pandas_like/dataframe.py index eb33b4fdf..b77d169d3 100644 --- a/narwhals/_pandas_like/dataframe.py +++ b/narwhals/_pandas_like/dataframe.py @@ -738,3 +738,19 @@ def sample( n=n, frac=fraction, replace=with_replacement, random_state=seed ) ) + + def unpivot( + self: Self, + on: str | list[str] | None, + index: str | list[str] | None, + variable_name: str | None, + value_name: str | None, + ) -> Self: + return self._from_native_frame( + self._native_frame.melt( + id_vars=index, + value_vars=on, + var_name=variable_name if variable_name is not None else "variable", + value_name=value_name if value_name is not None else "value", + ) + ) diff --git a/narwhals/_polars/dataframe.py b/narwhals/_polars/dataframe.py index 1947a5aee..0a6358813 100644 --- a/narwhals/_polars/dataframe.py +++ b/narwhals/_polars/dataframe.py @@ -193,6 +193,28 @@ def drop(self: Self, columns: list[str], strict: bool) -> Self: # noqa: FBT001 return self._from_native_frame(self._native_frame.drop(to_drop)) return self._from_native_frame(self._native_frame.drop(columns, strict=strict)) + def unpivot( + self: Self, + on: str | list[str] | None, + index: str | list[str] | None, + variable_name: 
str | None, + value_name: str | None, + ) -> Self: + if self._backend_version < (1, 0, 0): # pragma: no cover + return self._from_native_frame( + self._native_frame.melt( + id_vars=index, + value_vars=on, + variable_name=variable_name, + value_name=value_name, + ) + ) + return self._from_native_frame( + self._native_frame.unpivot( + on=on, index=index, variable_name=variable_name, value_name=value_name + ) + ) + class PolarsLazyFrame: def __init__(self, df: Any, *, backend_version: tuple[int, ...]) -> None: @@ -263,3 +285,25 @@ def drop(self: Self, columns: list[str], strict: bool) -> Self: # noqa: FBT001 if self._backend_version < (1, 0, 0): # pragma: no cover return self._from_native_frame(self._native_frame.drop(columns)) return self._from_native_frame(self._native_frame.drop(columns, strict=strict)) + + def unpivot( + self: Self, + on: str | list[str] | None, + index: str | list[str] | None, + variable_name: str | None, + value_name: str | None, + ) -> Self: + if self._backend_version < (1, 0, 0): # pragma: no cover + return self._from_native_frame( + self._native_frame.melt( + id_vars=index, + value_vars=on, + variable_name=variable_name, + value_name=value_name, + ) + ) + return self._from_native_frame( + self._native_frame.unpivot( + on=on, index=index, variable_name=variable_name, value_name=value_name + ) + ) diff --git a/narwhals/dataframe.py b/narwhals/dataframe.py index 3cdd972e2..86e157322 100644 --- a/narwhals/dataframe.py +++ b/narwhals/dataframe.py @@ -290,6 +290,23 @@ def join_asof( ) ) + def unpivot( + self: Self, + on: str | list[str] | None, + *, + index: str | list[str] | None, + variable_name: str | None, + value_name: str | None, + ) -> Self: + return self._from_compliant_dataframe( + self._compliant_frame.unpivot( + on=on, + index=index, + variable_name=variable_name, + value_name=value_name, + ) + ) + class DataFrame(BaseFrame[FrameT]): """ @@ -2617,6 +2634,93 @@ def sample( ) ) + def unpivot( + self: Self, + on: str | list[str] | None = None, + *, + index: str | list[str] | None = None, + variable_name: str | None = None, + value_name: str | None = None, + ) -> Self: + r""" + Unpivot a DataFrame from wide to long format. + + Optionally leaves identifiers set. + + This function is useful to massage a DataFrame into a format where one or more + columns are identifier variables (index) while all other columns, considered + measured variables (on), are "unpivoted" to the row axis leaving just + two non-identifier columns, 'variable' and 'value'. + + Arguments: + on: Column(s) to use as values variables; if `on` is empty all columns that + are not in `index` will be used. + index: Column(s) to use as identifier variables. + variable_name: Name to give to the `variable` column. Defaults to "variable". + value_name: Name to give to the `value` column. Defaults to "value". + + Notes: + If you're coming from pandas, this is similar to `pandas.DataFrame.melt`, + but with `index` replacing `id_vars` and `on` replacing `value_vars`. + In other frameworks, you might know this operation as `pivot_longer`. + + Examples: + >>> import narwhals as nw + >>> import pandas as pd + >>> import polars as pl + >>> import pyarrow as pa + >>> data = { + ... "a": ["x", "y", "z"], + ... "b": [1, 3, 5], + ... "c": [2, 4, 6], + ... } + + We define a library agnostic function: + + >>> @nw.narwhalify + ... def func(df): + ... 
return df.unpivot(on=["b", "c"], index="a") + + We can pass any supported library such as pandas, Polars or PyArrow to `func`: + + >>> func(pl.DataFrame(data)) + shape: (6, 3) + ┌─────┬──────────┬───────┐ + │ a ┆ variable ┆ value │ + │ --- ┆ --- ┆ --- │ + │ str ┆ str ┆ i64 │ + ╞═════╪══════════╪═══════╡ + │ x ┆ b ┆ 1 │ + │ y ┆ b ┆ 3 │ + │ z ┆ b ┆ 5 │ + │ x ┆ c ┆ 2 │ + │ y ┆ c ┆ 4 │ + │ z ┆ c ┆ 6 │ + └─────┴──────────┴───────┘ + + >>> func(pd.DataFrame(data)) + a variable value + 0 x b 1 + 1 y b 3 + 2 z b 5 + 3 x c 2 + 4 y c 4 + 5 z c 6 + + >>> func(pa.table(data)) + pyarrow.Table + a: string + variable: string + value: int64 + ---- + a: [["x","y","z"],["x","y","z"]] + variable: [["b","b","b"],["c","c","c"]] + value: [[1,3,5],[2,4,6]] + """ + return super().unpivot( + on=on, index=index, variable_name=variable_name, value_name=value_name + ) + class LazyFrame(BaseFrame[FrameT]): """ @@ -4160,3 +4264,69 @@ def gather_every(self: Self, n: int, offset: int = 0) -> Self: └─────┴─────┘ """ return super().gather_every(n=n, offset=offset) + + def unpivot( + self: Self, + on: str | list[str] | None = None, + *, + index: str | list[str] | None = None, + variable_name: str | None = None, + value_name: str | None = None, + ) -> Self: + r""" + Unpivot a DataFrame from wide to long format. + + Optionally leaves identifiers set. + + This function is useful to massage a DataFrame into a format where one or more + columns are identifier variables (index) while all other columns, considered + measured variables (on), are "unpivoted" to the row axis leaving just + two non-identifier columns, 'variable' and 'value'. + + Arguments: + on: Column(s) to use as values variables; if `on` is empty all columns that + are not in `index` will be used. + index: Column(s) to use as identifier variables. + variable_name: Name to give to the `variable` column. Defaults to "variable". + value_name: Name to give to the `value` column. Defaults to "value". + + Notes: + If you're coming from pandas, this is similar to `pandas.DataFrame.melt`, + but with `index` replacing `id_vars` and `on` replacing `value_vars`. + In other frameworks, you might know this operation as `pivot_longer`. + + Examples: + >>> import narwhals as nw + >>> import polars as pl + >>> data = { + ... "a": ["x", "y", "z"], + ... "b": [1, 3, 5], + ... "c": [2, 4, 6], + ... } + + We define a library agnostic function: + + >>> @nw.narwhalify + ... def func(lf): + ... return ( + ... lf.unpivot(on=["b", "c"], index="a").sort(["variable", "a"]).collect() + ... 
) + + >>> func(pl.LazyFrame(data)) + shape: (6, 3) + ┌─────┬──────────┬───────┐ + │ a ┆ variable ┆ value │ + │ --- ┆ --- ┆ --- │ + │ str ┆ str ┆ i64 │ + ╞═════╪══════════╪═══════╡ + │ x ┆ b ┆ 1 │ + │ y ┆ b ┆ 3 │ + │ z ┆ b ┆ 5 │ + │ x ┆ c ┆ 2 │ + │ y ┆ c ┆ 4 │ + │ z ┆ c ┆ 6 │ + └─────┴──────────┴───────┘ + """ + return super().unpivot( + on=on, index=index, variable_name=variable_name, value_name=value_name + ) diff --git a/tests/frame/unpivot_test.py b/tests/frame/unpivot_test.py new file mode 100644 index 000000000..cccf38035 --- /dev/null +++ b/tests/frame/unpivot_test.py @@ -0,0 +1,67 @@ +from __future__ import annotations + +import pytest + +import narwhals.stable.v1 as nw +from tests.utils import Constructor +from tests.utils import compare_dicts + +data = { + "a": ["x", "y", "z"], + "b": [1, 3, 5], + "c": [2, 4, 6], +} + +expected_b_only = { + "a": ["x", "y", "z"], + "variable": ["b", "b", "b"], + "value": [1, 3, 5], +} + +expected_b_c = { + "a": ["x", "y", "z", "x", "y", "z"], + "variable": ["b", "b", "b", "c", "c", "c"], + "value": [1, 3, 5, 2, 4, 6], +} + + +@pytest.mark.parametrize( + ("on", "expected"), + [("b", expected_b_only), (["b", "c"], expected_b_c), (None, expected_b_c)], +) +def test_unpivot_on( + constructor: Constructor, + on: str | list[str] | None, + expected: dict[str, list[float]], +) -> None: + df = nw.from_native(constructor(data)) + result = df.unpivot(on=on, index=["a"]).sort("variable", "a") + compare_dicts(result, expected) + + +@pytest.mark.parametrize( + ("variable_name", "value_name"), + [ + ("", "custom_value_name"), + ("custom_variable_name", ""), + ("custom_variable_name", "custom_value_name"), + ], +) +def test_unpivot_var_value_names( + constructor: Constructor, + variable_name: str | None, + value_name: str | None, +) -> None: + df = nw.from_native(constructor(data)) + result = df.unpivot( + on=["b", "c"], index=["a"], variable_name=variable_name, value_name=value_name + ) + + assert result.collect_schema().names()[-2:] == [variable_name, value_name] + + +def test_unpivot_default_var_value_names(constructor: Constructor) -> None: + df = nw.from_native(constructor(data)) + result = df.unpivot(on=["b", "c"], index=["a"]) + + assert result.collect_schema().names()[-2:] == ["variable", "value"] From 5e49378721545c619b02a3fdc6aeb1bc56c426bd Mon Sep 17 00:00:00 2001 From: Marco Edward Gorelli Date: Fri, 27 Sep 2024 17:38:43 +0200 Subject: [PATCH 056/145] feat: Initial support for nested dtypes (List, Array, Struct) (#1083) --- docs/api-reference/dtypes.md | 3 ++ narwhals/__init__.py | 6 +++ narwhals/_arrow/namespace.py | 3 ++ narwhals/_arrow/utils.py | 15 +++++++ narwhals/_dask/expr.py | 4 +- narwhals/_dask/namespace.py | 7 ++- narwhals/_dask/utils.py | 11 ++++- narwhals/_duckdb/dataframe.py | 8 ++++ narwhals/_ibis/dataframe.py | 4 ++ narwhals/_pandas_like/namespace.py | 3 ++ narwhals/_pandas_like/utils.py | 15 +++++++ narwhals/_polars/namespace.py | 3 ++ narwhals/_polars/utils.py | 15 +++++++ narwhals/dtypes.py | 9 ++++ narwhals/stable/v1.py | 6 +++ tests/frame/schema_test.py | 70 +++++++++++++++++++++++++++--- 16 files changed, 172 insertions(+), 10 deletions(-) diff --git a/docs/api-reference/dtypes.md b/docs/api-reference/dtypes.md index c21b5c766..eb96608a6 100644 --- a/docs/api-reference/dtypes.md +++ b/docs/api-reference/dtypes.md @@ -4,6 +4,9 @@ handler: python options: members: + - Array + - List + - Struct - Int64 - Int32 - Int16 diff --git a/narwhals/__init__.py b/narwhals/__init__.py index a5f95cf70..8b8529c06 100644 --- a/narwhals/__init__.py 
+++ b/narwhals/__init__.py @@ -3,6 +3,7 @@ from narwhals import stable from narwhals.dataframe import DataFrame from narwhals.dataframe import LazyFrame +from narwhals.dtypes import Array from narwhals.dtypes import Boolean from narwhals.dtypes import Categorical from narwhals.dtypes import Date @@ -15,8 +16,10 @@ from narwhals.dtypes import Int16 from narwhals.dtypes import Int32 from narwhals.dtypes import Int64 +from narwhals.dtypes import List from narwhals.dtypes import Object from narwhals.dtypes import String +from narwhals.dtypes import Struct from narwhals.dtypes import UInt8 from narwhals.dtypes import UInt16 from narwhals.dtypes import UInt32 @@ -107,6 +110,9 @@ "String", "Datetime", "Duration", + "Struct", + "Array", + "List", "Date", "narwhalify", "show_versions", diff --git a/narwhals/_arrow/namespace.py b/narwhals/_arrow/namespace.py index 5a608f12e..6f699fa2a 100644 --- a/narwhals/_arrow/namespace.py +++ b/narwhals/_arrow/namespace.py @@ -44,6 +44,9 @@ class ArrowNamespace: Datetime = dtypes.Datetime Duration = dtypes.Duration Date = dtypes.Date + List = dtypes.List + Struct = dtypes.Struct + Array = dtypes.Array def _create_expr_from_callable( self, diff --git a/narwhals/_arrow/utils.py b/narwhals/_arrow/utils.py index ddf7a8639..7d6844e57 100644 --- a/narwhals/_arrow/utils.py +++ b/narwhals/_arrow/utils.py @@ -54,6 +54,12 @@ def translate_dtype(dtype: Any) -> dtypes.DType: return dtypes.Duration() if pa.types.is_dictionary(dtype): return dtypes.Categorical() + if pa.types.is_struct(dtype): + return dtypes.Struct() + if pa.types.is_list(dtype) or pa.types.is_large_list(dtype): + return dtypes.List() + if pa.types.is_fixed_size_list(dtype): + return dtypes.Array() return dtypes.Unknown() # pragma: no cover @@ -96,6 +102,15 @@ def narwhals_to_native_dtype(dtype: dtypes.DType | type[dtypes.DType]) -> Any: return pa.duration("us") if isinstance_or_issubclass(dtype, dtypes.Date): return pa.date32() + if isinstance_or_issubclass(dtype, dtypes.List): # pragma: no cover + msg = "Converting to List dtype is not supported yet" + return NotImplementedError(msg) + if isinstance_or_issubclass(dtype, dtypes.Struct): # pragma: no cover + msg = "Converting to Struct dtype is not supported yet" + return NotImplementedError(msg) + if isinstance_or_issubclass(dtype, dtypes.Array): # pragma: no cover + msg = "Converting to Array dtype is not supported yet" + return NotImplementedError(msg) msg = f"Unknown dtype: {dtype}" # pragma: no cover raise AssertionError(msg) diff --git a/narwhals/_dask/expr.py b/narwhals/_dask/expr.py index a6eb17566..eda0fd589 100644 --- a/narwhals/_dask/expr.py +++ b/narwhals/_dask/expr.py @@ -9,7 +9,7 @@ from narwhals._dask.utils import add_row_index from narwhals._dask.utils import maybe_evaluate -from narwhals._dask.utils import reverse_translate_dtype +from narwhals._dask.utils import narwhals_to_native_dtype from narwhals.utils import generate_unique_token if TYPE_CHECKING: @@ -700,7 +700,7 @@ def cast( dtype: DType | type[DType], ) -> Self: def func(_input: Any, dtype: DType | type[DType]) -> Any: - dtype = reverse_translate_dtype(dtype) + dtype = narwhals_to_native_dtype(dtype) return _input.astype(dtype) return self._from_call( diff --git a/narwhals/_dask/namespace.py b/narwhals/_dask/namespace.py index a683b39b1..7d661f063 100644 --- a/narwhals/_dask/namespace.py +++ b/narwhals/_dask/namespace.py @@ -13,7 +13,7 @@ from narwhals._dask.dataframe import DaskLazyFrame from narwhals._dask.expr import DaskExpr from narwhals._dask.selectors import 
DaskSelectorNamespace -from narwhals._dask.utils import reverse_translate_dtype +from narwhals._dask.utils import narwhals_to_native_dtype from narwhals._dask.utils import validate_comparand from narwhals._expression_parsing import combine_root_names from narwhals._expression_parsing import parse_into_exprs @@ -45,6 +45,9 @@ class DaskNamespace: Datetime = dtypes.Datetime Duration = dtypes.Duration Date = dtypes.Date + List = dtypes.List + Struct = dtypes.Struct + Array = dtypes.Array @property def selectors(self) -> DaskSelectorNamespace: @@ -83,7 +86,7 @@ def lit(self, value: Any, dtype: dtypes.DType | None) -> DaskExpr: def convert_if_dtype( series: dask_expr.Series, dtype: DType | type[DType] ) -> dask_expr.Series: - return series.astype(reverse_translate_dtype(dtype)) if dtype else series + return series.astype(narwhals_to_native_dtype(dtype)) if dtype else series return DaskExpr( lambda df: [ diff --git a/narwhals/_dask/utils.py b/narwhals/_dask/utils.py index 274044979..02dedab4e 100644 --- a/narwhals/_dask/utils.py +++ b/narwhals/_dask/utils.py @@ -83,7 +83,7 @@ def validate_comparand(lhs: dask_expr.Series, rhs: dask_expr.Series) -> None: raise RuntimeError(msg) -def reverse_translate_dtype(dtype: DType | type[DType]) -> Any: +def narwhals_to_native_dtype(dtype: DType | type[DType]) -> Any: from narwhals import dtypes if isinstance_or_issubclass(dtype, dtypes.Float64): @@ -122,6 +122,15 @@ def reverse_translate_dtype(dtype: DType | type[DType]) -> Any: return "datetime64[us]" if isinstance_or_issubclass(dtype, dtypes.Duration): return "timedelta64[ns]" + if isinstance_or_issubclass(dtype, dtypes.List): # pragma: no cover + msg = "Converting to List dtype is not supported yet" + return NotImplementedError(msg) + if isinstance_or_issubclass(dtype, dtypes.Struct): # pragma: no cover + msg = "Converting to Struct dtype is not supported yet" + return NotImplementedError(msg) + if isinstance_or_issubclass(dtype, dtypes.Array): # pragma: no cover + msg = "Converting to Array dtype is not supported yet" + return NotImplementedError(msg) msg = f"Unknown dtype: {dtype}" # pragma: no cover raise AssertionError(msg) diff --git a/narwhals/_duckdb/dataframe.py b/narwhals/_duckdb/dataframe.py index 2263c3fc7..099a91b72 100644 --- a/narwhals/_duckdb/dataframe.py +++ b/narwhals/_duckdb/dataframe.py @@ -1,5 +1,6 @@ from __future__ import annotations +import re from typing import TYPE_CHECKING from typing import Any @@ -17,6 +18,7 @@ def map_duckdb_dtype_to_narwhals_dtype( duckdb_dtype: Any, ) -> dtypes.DType: + duckdb_dtype = str(duckdb_dtype) if duckdb_dtype == "BIGINT": return dtypes.Int64() if duckdb_dtype == "INTEGER": @@ -47,6 +49,12 @@ def map_duckdb_dtype_to_narwhals_dtype( return dtypes.Boolean() if duckdb_dtype == "INTERVAL": return dtypes.Duration() + if duckdb_dtype.startswith("STRUCT"): + return dtypes.Struct() + if re.match(r"\w+\[\]", duckdb_dtype): + return dtypes.List() + if re.match(r"\w+\[\d+\]", duckdb_dtype): + return dtypes.Array() return dtypes.Unknown() diff --git a/narwhals/_ibis/dataframe.py b/narwhals/_ibis/dataframe.py index fb7bedbf1..f0dc8f6eb 100644 --- a/narwhals/_ibis/dataframe.py +++ b/narwhals/_ibis/dataframe.py @@ -44,6 +44,10 @@ def map_ibis_dtype_to_narwhals_dtype( return dtypes.Date() if ibis_dtype.is_timestamp(): return dtypes.Datetime() + if ibis_dtype.is_array(): + return dtypes.List() + if ibis_dtype.is_struct(): + return dtypes.Struct() return dtypes.Unknown() # pragma: no cover diff --git a/narwhals/_pandas_like/namespace.py 
b/narwhals/_pandas_like/namespace.py index b60d0dcce..357ef80ab 100644 --- a/narwhals/_pandas_like/namespace.py +++ b/narwhals/_pandas_like/namespace.py @@ -44,6 +44,9 @@ class PandasLikeNamespace: Datetime = dtypes.Datetime Duration = dtypes.Duration Date = dtypes.Date + List = dtypes.List + Struct = dtypes.Struct + Array = dtypes.Array @property def selectors(self) -> PandasSelectorNamespace: diff --git a/narwhals/_pandas_like/utils.py b/narwhals/_pandas_like/utils.py index aadb438e2..5745ffd8a 100644 --- a/narwhals/_pandas_like/utils.py +++ b/narwhals/_pandas_like/utils.py @@ -256,6 +256,12 @@ def translate_dtype(column: Any) -> DType: return dtypes.Duration() if dtype == "date32[day][pyarrow]": return dtypes.Date() + if dtype.startswith(("large_list", "list")): + return dtypes.List() + if dtype.startswith("fixed_size_list"): + return dtypes.Array() + if dtype.startswith("struct"): + return dtypes.Struct() if dtype == "object": if ( # pragma: no cover TODO(unassigned): why does this show as uncovered? idx := getattr(column, "first_valid_index", lambda: None)() @@ -423,6 +429,15 @@ def narwhals_to_native_dtype( # noqa: PLR0915 if isinstance_or_issubclass(dtype, dtypes.Enum): msg = "Converting to Enum is not (yet) supported" raise NotImplementedError(msg) + if isinstance_or_issubclass(dtype, dtypes.List): # pragma: no cover + msg = "Converting to List dtype is not supported yet" + return NotImplementedError(msg) + if isinstance_or_issubclass(dtype, dtypes.Struct): # pragma: no cover + msg = "Converting to Struct dtype is not supported yet" + return NotImplementedError(msg) + if isinstance_or_issubclass(dtype, dtypes.Array): # pragma: no cover + msg = "Converting to Array dtype is not supported yet" + return NotImplementedError(msg) msg = f"Unknown dtype: {dtype}" # pragma: no cover raise AssertionError(msg) diff --git a/narwhals/_polars/namespace.py b/narwhals/_polars/namespace.py index f25e8b81f..275c104fc 100644 --- a/narwhals/_polars/namespace.py +++ b/narwhals/_polars/namespace.py @@ -40,6 +40,9 @@ class PolarsNamespace: Datetime = dtypes.Datetime Duration = dtypes.Duration Date = dtypes.Date + List = dtypes.List + Struct = dtypes.Struct + Array = dtypes.Array def __init__(self, *, backend_version: tuple[int, ...]) -> None: self._backend_version = backend_version diff --git a/narwhals/_polars/utils.py b/narwhals/_polars/utils.py index e6fb36859..4a9809fc4 100644 --- a/narwhals/_polars/utils.py +++ b/narwhals/_polars/utils.py @@ -65,6 +65,12 @@ def translate_dtype(dtype: Any) -> dtypes.DType: return dtypes.Duration() if dtype == pl.Date: return dtypes.Date() + if dtype == pl.Struct: + return dtypes.Struct() + if dtype == pl.List: + return dtypes.List() + if dtype == pl.Array: + return dtypes.Array() return dtypes.Unknown() @@ -110,6 +116,15 @@ def narwhals_to_native_dtype(dtype: dtypes.DType | type[dtypes.DType]) -> Any: return pl.Duration() if dtype == dtypes.Date: return pl.Date() + if dtype == dtypes.List: # pragma: no cover + msg = "Converting to List dtype is not supported yet" + return NotImplementedError(msg) + if dtype == dtypes.Struct: # pragma: no cover + msg = "Converting to Struct dtype is not supported yet" + return NotImplementedError(msg) + if dtype == dtypes.Array: # pragma: no cover + msg = "Converting to Array dtype is not supported yet" + return NotImplementedError(msg) return pl.Unknown() # pragma: no cover diff --git a/narwhals/dtypes.py b/narwhals/dtypes.py index 4d8da4293..2d5de0f16 100644 --- a/narwhals/dtypes.py +++ b/narwhals/dtypes.py @@ -83,4 +83,13 @@ class 
Categorical(DType): ... class Enum(DType): ... +class Struct(DType): ... + + +class List(DType): ... + + +class Array(DType): ... + + class Date(TemporalType): ... diff --git a/narwhals/stable/v1.py b/narwhals/stable/v1.py index cc0a42bed..b54203ca2 100644 --- a/narwhals/stable/v1.py +++ b/narwhals/stable/v1.py @@ -14,6 +14,7 @@ from narwhals import selectors from narwhals.dataframe import DataFrame as NwDataFrame from narwhals.dataframe import LazyFrame as NwLazyFrame +from narwhals.dtypes import Array from narwhals.dtypes import Boolean from narwhals.dtypes import Categorical from narwhals.dtypes import Date @@ -26,8 +27,10 @@ from narwhals.dtypes import Int16 from narwhals.dtypes import Int32 from narwhals.dtypes import Int64 +from narwhals.dtypes import List from narwhals.dtypes import Object from narwhals.dtypes import String +from narwhals.dtypes import Struct from narwhals.dtypes import UInt8 from narwhals.dtypes import UInt16 from narwhals.dtypes import UInt32 @@ -1970,6 +1973,9 @@ def from_dict( "String", "Datetime", "Duration", + "Struct", + "Array", + "List", "Date", "narwhalify", "show_versions", diff --git a/tests/frame/schema_test.py b/tests/frame/schema_test.py index d7ba69ab2..3aa341e0f 100644 --- a/tests/frame/schema_test.py +++ b/tests/frame/schema_test.py @@ -4,6 +4,7 @@ from datetime import timezone from typing import Any +import duckdb import pandas as pd import polars as pl import pytest @@ -95,6 +96,8 @@ def test_dtypes() -> None: "p": ["a"], "q": [timedelta(1)], "r": ["a"], + "s": [[1, 2]], + "u": [{"a": 1}], }, schema={ "a": pl.Int64, @@ -115,6 +118,8 @@ def test_dtypes() -> None: "p": pl.Categorical, "q": pl.Duration, "r": pl.Enum(["a", "b"]), + "s": pl.List(pl.Int64), + "u": pl.Struct({"a": pl.Int64}), }, ) df_from_pl = nw.from_native(df_pl, eager_only=True) @@ -137,6 +142,8 @@ def test_dtypes() -> None: "p": nw.Categorical, "q": nw.Duration, "r": nw.Enum, + "s": nw.List, + "u": nw.Struct, } assert df_from_pl.schema == df_from_pl.collect_schema() @@ -164,11 +171,6 @@ def test_unknown_dtype() -> None: assert nw.from_native(df).schema == {"a": nw.Unknown} -def test_unknown_dtype_polars() -> None: - df = pl.DataFrame({"a": [[1, 2, 3]]}) - assert nw.from_native(df).schema == {"a": nw.Unknown} - - def test_hash() -> None: assert nw.Int64() in {nw.Int64, nw.Int32} @@ -199,3 +201,61 @@ def test_from_non_hashable_column_name() -> None: df = nw.from_native(df, eager_only=True) assert df.columns == ["pizza", ["a", "b"]] assert df["pizza"].dtype == nw.Int64 + + +@pytest.mark.skipif( + parse_version(pd.__version__) < parse_version("2.2.0"), + reason="too old for pyarrow types", +) +def test_nested_dtypes() -> None: + df = pl.DataFrame( + {"a": [[1, 2]], "b": [[1, 2]], "c": [{"a": 1}]}, + schema_overrides={"b": pl.Array(pl.Int64, 2)}, + ).to_pandas(use_pyarrow_extension_array=True) + nwdf = nw.from_native(df) + assert nwdf.schema == {"a": nw.List, "b": nw.Array, "c": nw.Struct} + df = pl.DataFrame( + {"a": [[1, 2]], "b": [[1, 2]], "c": [{"a": 1}]}, + schema_overrides={"b": pl.Array(pl.Int64, 2)}, + ) + nwdf = nw.from_native(df) + assert nwdf.schema == {"a": nw.List, "b": nw.Array, "c": nw.Struct} + df = pl.DataFrame( + {"a": [[1, 2]], "b": [[1, 2]], "c": [{"a": 1}]}, + schema_overrides={"b": pl.Array(pl.Int64, 2)}, + ).to_arrow() + nwdf = nw.from_native(df) + assert nwdf.schema == {"a": nw.List, "b": nw.Array, "c": nw.Struct} + df = duckdb.sql("select * from df") + nwdf = nw.from_native(df) + assert nwdf.schema == {"a": nw.List, "b": nw.Array, "c": nw.Struct} + + +def 
test_nested_dtypes_ibis() -> None: # pragma: no cover + ibis = pytest.importorskip("ibis") + df = pl.DataFrame( + {"a": [[1, 2]], "b": [[1, 2]], "c": [{"a": 1}]}, + schema_overrides={"b": pl.Array(pl.Int64, 2)}, + ) + tbl = ibis.memtable(df[["a", "c"]]) + nwdf = nw.from_native(tbl) + assert nwdf.schema == {"a": nw.List, "c": nw.Struct} + + +@pytest.mark.skipif( + parse_version(pd.__version__) < parse_version("2.2.0"), + reason="too old for pyarrow types", +) +def test_nested_dtypes_dask() -> None: + pytest.importorskip("dask") + pytest.importorskip("dask_expr", exc_type=ImportError) + import dask.dataframe as dd + + df = dd.from_pandas( + pl.DataFrame( + {"a": [[1, 2]], "b": [[1, 2]], "c": [{"a": 1}]}, + schema_overrides={"b": pl.Array(pl.Int64, 2)}, + ).to_pandas(use_pyarrow_extension_array=True) + ) + nwdf = nw.from_native(df) + assert nwdf.schema == {"a": nw.List, "b": nw.Array, "c": nw.Struct} From 76c8e8eaedfb69d26ade543c986f2dcb551e8bdd Mon Sep 17 00:00:00 2001 From: Marco Edward Gorelli Date: Fri, 27 Sep 2024 21:40:41 +0200 Subject: [PATCH 057/145] release: Bump version to 1.8.4 (#1084) --- docs/installation.md | 2 +- narwhals/__init__.py | 2 +- pyproject.toml | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/docs/installation.md b/docs/installation.md index 28a07d5b0..505bec639 100644 --- a/docs/installation.md +++ b/docs/installation.md @@ -13,7 +13,7 @@ Then, if you start the Python REPL and see the following: ```python >>> import narwhals >>> narwhals.__version__ -'1.8.3' +'1.8.4' ``` then installation worked correctly! diff --git a/narwhals/__init__.py b/narwhals/__init__.py index 8b8529c06..2b571f0e2 100644 --- a/narwhals/__init__.py +++ b/narwhals/__init__.py @@ -57,7 +57,7 @@ from narwhals.utils import maybe_get_index from narwhals.utils import maybe_set_index -__version__ = "1.8.3" +__version__ = "1.8.4" __all__ = [ "dependencies", diff --git a/pyproject.toml b/pyproject.toml index 13128e4f8..e5fcf1abc 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -4,7 +4,7 @@ build-backend = "hatchling.build" [project] name = "narwhals" -version = "1.8.3" +version = "1.8.4" authors = [ { name="Marco Gorelli", email="33491632+MarcoGorelli@users.noreply.github.com" }, ] From f3bc18fdd93565cef04809d694841a0437533ecc Mon Sep 17 00:00:00 2001 From: Vahideh Alizadeh <82591913+V-Alizade@users.noreply.github.com> Date: Sat, 28 Sep 2024 14:06:29 +0200 Subject: [PATCH 058/145] docs: add pyarrow to dataframe lazy docstring (#1055) * add pyarrow to dataframe lazy docstring * update:add pyarrow to dataframe lazy docstring --- narwhals/dataframe.py | 15 +++++++++++++-- narwhals/stable/v1.py | 15 +++++++++++++-- 2 files changed, 26 insertions(+), 4 deletions(-) diff --git a/narwhals/dataframe.py b/narwhals/dataframe.py index 86e157322..24c147ddb 100644 --- a/narwhals/dataframe.py +++ b/narwhals/dataframe.py @@ -382,14 +382,16 @@ def lazy(self) -> LazyFrame[Any]: If a library does not support lazy execution, then this is a no-op. Examples: - Construct pandas and Polars DataFrames: + Construct pandas, Polars and PyArrow DataFrames: >>> import pandas as pd >>> import polars as pl + >>> import pyarrow as pa >>> import narwhals as nw >>> df = {"foo": [1, 2, 3], "bar": [6.0, 7.0, 8.0], "ham": ["a", "b", "c"]} >>> df_pd = pd.DataFrame(df) >>> df_pl = pl.DataFrame(df) + >>> df_pa = pa.table(df) We define a library agnostic function: @@ -397,7 +399,7 @@ def lazy(self) -> LazyFrame[Any]: ... def func(df): ... 
return df.lazy() - Note that then, pandas dataframe stay eager, but Polars DataFrame becomes a Polars LazyFrame: + Note that then, pandas and pyarrow dataframe stay eager, but Polars DataFrame becomes a Polars LazyFrame: >>> func(df_pd) foo bar ham @@ -406,6 +408,15 @@ def lazy(self) -> LazyFrame[Any]: 2 3 8.0 c >>> func(df_pl) + >>> func(df_pa) + pyarrow.Table + foo: int64 + bar: double + ham: string + ---- + foo: [[1,2,3]] + bar: [[6,7,8]] + ham: [["a","b","c"]] """ return super().lazy() diff --git a/narwhals/stable/v1.py b/narwhals/stable/v1.py index b54203ca2..d170c4ebc 100644 --- a/narwhals/stable/v1.py +++ b/narwhals/stable/v1.py @@ -119,14 +119,16 @@ def lazy(self) -> LazyFrame[Any]: If a library does not support lazy execution, then this is a no-op. Examples: - Construct pandas and Polars DataFrames: + Construct pandas, Polars and PyArrow DataFrames: >>> import pandas as pd >>> import polars as pl + >>> import pyarrow as pa >>> import narwhals.stable.v1 as nw >>> df = {"foo": [1, 2, 3], "bar": [6.0, 7.0, 8.0], "ham": ["a", "b", "c"]} >>> df_pd = pd.DataFrame(df) >>> df_pl = pl.DataFrame(df) + >>> df_pa = pa.table(df) We define a library agnostic function: @@ -134,7 +136,7 @@ def lazy(self) -> LazyFrame[Any]: ... def func(df): ... return df.lazy() - Note that then, pandas dataframe stay eager, but Polars DataFrame becomes a Polars LazyFrame: + Note that then, pandas and pyarrow dataframe stay eager, but Polars DataFrame becomes a Polars LazyFrame: >>> func(df_pd) foo bar ham @@ -143,6 +145,15 @@ def lazy(self) -> LazyFrame[Any]: 2 3 8.0 c >>> func(df_pl) + >>> func(df_pa) + pyarrow.Table + foo: int64 + bar: double + ham: string + ---- + foo: [[1,2,3]] + bar: [[6,7,8]] + ham: [["a","b","c"]] """ return _stableify(super().lazy()) # type: ignore[no-any-return] From 18392fdf041e366e871b6812e0185d3400eb44cd Mon Sep 17 00:00:00 2001 From: Marco Edward Gorelli Date: Sat, 28 Sep 2024 14:31:35 +0200 Subject: [PATCH 059/145] fix: make `narwhalify` part of stable api (#1086) --- narwhals/_dask/dataframe.py | 3 - narwhals/dataframe.py | 46 +++++---- narwhals/series.py | 18 ++-- narwhals/stable/{v1.py => v1/__init__.py} | 110 +++++++++++++++++----- tests/frame/lazy_test.py | 6 +- tests/stable_api_test.py | 29 ++++++ 6 files changed, 153 insertions(+), 59 deletions(-) rename narwhals/stable/{v1.py => v1/__init__.py} (94%) diff --git a/narwhals/_dask/dataframe.py b/narwhals/_dask/dataframe.py index fd40e876a..ed001e105 100644 --- a/narwhals/_dask/dataframe.py +++ b/narwhals/_dask/dataframe.py @@ -98,9 +98,6 @@ def filter( mask = expr._call(self)[0] return self._from_native_frame(self._native_frame.loc[mask]) - def lazy(self) -> Self: - return self - def select( self: Self, *exprs: IntoDaskExpr, diff --git a/narwhals/dataframe.py b/narwhals/dataframe.py index 24c147ddb..04a483703 100644 --- a/narwhals/dataframe.py +++ b/narwhals/dataframe.py @@ -107,12 +107,6 @@ def drop_nulls(self: Self, subset: str | list[str] | None = None) -> Self: def columns(self) -> list[str]: return self._compliant_frame.columns # type: ignore[no-any-return] - def lazy(self) -> LazyFrame[Any]: - return LazyFrame( - self._compliant_frame.lazy(), - level=self._level, - ) - def with_columns( self, *exprs: IntoExpr | Iterable[IntoExpr], **named_exprs: IntoExpr ) -> Self: @@ -318,6 +312,16 @@ class DataFrame(BaseFrame[FrameT]): `narwhals.from_native`. 
""" + @property + def _series(self) -> type[Series]: + from narwhals.series import Series + + return Series + + @property + def _lazyframe(self) -> type[LazyFrame[Any]]: + return LazyFrame + def __init__( self, df: Any, @@ -418,7 +422,7 @@ def lazy(self) -> LazyFrame[Any]: bar: [[6,7,8]] ham: [["a","b","c"]] """ - return super().lazy() + return self._lazyframe(self._compliant_frame.lazy(), level=self._level) def to_native(self) -> FrameT: """ @@ -678,9 +682,7 @@ def get_column(self, name: str) -> Series: 2 ] """ - from narwhals.series import Series - - return Series( + return self._series( self._compliant_frame.get_column(name), level=self._level, ) @@ -830,9 +832,7 @@ def __getitem__( return self._from_compliant_dataframe(self._compliant_frame[item[0]]) return self._from_compliant_dataframe(self._compliant_frame[item]) if isinstance(item, str) or (isinstance(item, tuple) and len(item) == 2): - from narwhals.series import Series - - return Series( + return self._series( self._compliant_frame[item], level=self._level, ) @@ -898,11 +898,9 @@ def to_dict( >>> func(df_pa) {'A': [1, 2, 3, 4, 5], 'fruits': ['banana', 'banana', 'apple', 'apple', 'banana'], 'B': [5, 4, 3, 2, 1], 'animals': ['beetle', 'fly', 'beetle', 'beetle', 'beetle'], 'optional': [28, 300, None, 2, -30]} """ - from narwhals.series import Series - if as_series: return { - key: Series( + key: self._series( value, level=self._level, ) @@ -2298,9 +2296,7 @@ def is_duplicated(self: Self) -> Series: true ] """ - from narwhals.series import Series - - return Series( + return self._series( self._compliant_frame.is_duplicated(), level=self._level, ) @@ -2381,9 +2377,7 @@ def is_unique(self: Self) -> Series: false ] """ - from narwhals.series import Series - - return Series( + return self._series( self._compliant_frame.is_unique(), level=self._level, ) @@ -2743,6 +2737,10 @@ class LazyFrame(BaseFrame[FrameT]): `narwhals.from_native`. """ + @property + def _dataframe(self) -> type[DataFrame[Any]]: + return DataFrame + def __init__( self, df: Any, @@ -2810,7 +2808,7 @@ def collect(self) -> DataFrame[Any]: │ c ┆ 6 ┆ 1 │ └─────┴─────┴─────┘ """ - return DataFrame( + return self._dataframe( self._compliant_frame.collect(), level=self._level, ) @@ -4233,7 +4231,7 @@ def lazy(self) -> Self: >>> func(df_pl) """ - return super().lazy() # type: ignore[return-value] + return self def gather_every(self: Self, n: int, offset: int = 0) -> Self: r""" diff --git a/narwhals/series.py b/narwhals/series.py index bd99fc5c7..5a84a9a5d 100644 --- a/narwhals/series.py +++ b/narwhals/series.py @@ -33,6 +33,12 @@ class Series: `series_only=True`. 
""" + @property + def _dataframe(self) -> type[DataFrame[Any]]: + from narwhals.dataframe import DataFrame + + return DataFrame + def __init__( self, series: Any, @@ -440,9 +446,7 @@ def to_frame(self) -> DataFrame[Any]: │ 3 │ └─────┘ """ - from narwhals.dataframe import DataFrame - - return DataFrame( + return self._dataframe( self._compliant_series.to_frame(), level=self._level, ) @@ -2021,9 +2025,7 @@ def value_counts( │ 3 ┆ 1 │ └─────┴───────┘ """ - from narwhals.dataframe import DataFrame - - return DataFrame( + return self._dataframe( self._compliant_series.value_counts( sort=sort, parallel=parallel, name=name, normalize=normalize ), @@ -2350,9 +2352,7 @@ def to_dummies( │ 0 ┆ 1 │ └─────┴─────┘ """ - from narwhals.dataframe import DataFrame - - return DataFrame( + return self._dataframe( self._compliant_series.to_dummies(separator=separator, drop_first=drop_first), level=self._level, ) diff --git a/narwhals/stable/v1.py b/narwhals/stable/v1/__init__.py similarity index 94% rename from narwhals/stable/v1.py rename to narwhals/stable/v1/__init__.py index d170c4ebc..8c4bd877c 100644 --- a/narwhals/stable/v1.py +++ b/narwhals/stable/v1/__init__.py @@ -1,5 +1,6 @@ from __future__ import annotations +from functools import wraps from typing import TYPE_CHECKING from typing import Any from typing import Callable @@ -44,7 +45,6 @@ from narwhals.schema import Schema as NwSchema from narwhals.series import Series as NwSeries from narwhals.translate import get_native_namespace as nw_get_native_namespace -from narwhals.translate import narwhalify as nw_narwhalify from narwhals.translate import to_native from narwhals.typing import IntoDataFrameT from narwhals.typing import IntoFrameT @@ -75,6 +75,17 @@ class DataFrame(NwDataFrame[IntoDataFrameT]): `narwhals.from_native`. """ + # We need to override any method which don't return Self so that type + # annotations are correct. + + @property + def _series(self) -> type[Series]: + return Series + + @property + def _lazyframe(self) -> type[LazyFrame[Any]]: + return LazyFrame + @overload def __getitem__(self, item: tuple[Sequence[int], slice]) -> Self: ... @overload @@ -110,7 +121,7 @@ def __getitem__(self, item: slice) -> Self: ... def __getitem__(self, item: tuple[slice, slice]) -> Self: ... 
def __getitem__(self, item: Any) -> Any: - return _stableify(super().__getitem__(item)) + return super().__getitem__(item) def lazy(self) -> LazyFrame[Any]: """ @@ -155,7 +166,7 @@ def lazy(self) -> LazyFrame[Any]: bar: [[6,7,8]] ham: [["a","b","c"]] """ - return _stableify(super().lazy()) # type: ignore[no-any-return] + return super().lazy() # type: ignore[return-value] # Not sure what mypy is complaining about, probably some fancy # thing that I need to understand category theory for @@ -206,9 +217,7 @@ def to_dict( >>> func(df_pa) {'A': [1, 2, 3, 4, 5], 'fruits': ['banana', 'banana', 'apple', 'apple', 'banana'], 'B': [5, 4, 3, 2, 1], 'animals': ['beetle', 'fly', 'beetle', 'beetle', 'beetle'], 'optional': [28, 300, None, 2, -30]} """ - if as_series: - return {key: _stableify(value) for key, value in super().to_dict().items()} - return super().to_dict(as_series=False) + return super().to_dict(as_series=as_series) # type: ignore[return-value] def is_duplicated(self: Self) -> Series: r""" @@ -256,7 +265,7 @@ def is_duplicated(self: Self) -> Series: true ] """ - return _stableify(super().is_duplicated()) + return super().is_duplicated() # type: ignore[return-value] def is_unique(self: Self) -> Series: r""" @@ -304,7 +313,11 @@ def is_unique(self: Self) -> Series: false ] """ - return _stableify(super().is_unique()) + return super().is_unique() # type: ignore[return-value] + + def _l1_norm(self: Self) -> Self: + """Private, just used to test the stable API.""" + return self.select(all()._l1_norm()) class LazyFrame(NwLazyFrame[IntoFrameT]): @@ -317,6 +330,10 @@ class LazyFrame(NwLazyFrame[IntoFrameT]): `narwhals.from_native`. """ + @property + def _dataframe(self) -> type[DataFrame[Any]]: + return DataFrame + def collect(self) -> DataFrame[Any]: r""" Materialize this LazyFrame into a DataFrame. @@ -353,7 +370,11 @@ def collect(self) -> DataFrame[Any]: │ c ┆ 6 ┆ 1 │ └─────┴─────┴─────┘ """ - return _stableify(super().collect()) # type: ignore[no-any-return] + return super().collect() # type: ignore[return-value] + + def _l1_norm(self: Self) -> Self: + """Private, just used to test the stable API.""" + return self.select(all()._l1_norm()) class Series(NwSeries): @@ -367,6 +388,13 @@ class Series(NwSeries): `series_only=True`. """ + # We need to override any method which don't return Self so that type + # annotations are correct. + + @property + def _dataframe(self) -> type[DataFrame[Any]]: + return DataFrame + def to_frame(self) -> DataFrame[Any]: """ Convert to dataframe. @@ -404,7 +432,7 @@ def to_frame(self) -> DataFrame[Any]: │ 3 │ └─────┘ """ - return _stableify(super().to_frame()) # type: ignore[no-any-return] + return super().to_frame() # type: ignore[return-value] def value_counts( self: Self, @@ -458,10 +486,8 @@ def value_counts( │ 3 ┆ 1 │ └─────┴───────┘ """ - return _stableify( # type: ignore[no-any-return] - super().value_counts( - sort=sort, parallel=parallel, name=name, normalize=normalize - ) + return super().value_counts( # type: ignore[return-value] + sort=sort, parallel=parallel, name=name, normalize=normalize ) @@ -865,14 +891,54 @@ def func(df): allow_series: Whether to allow series (default is only dataframe / lazyframe). 
""" - return nw_narwhalify( - func=func, - strict=strict, - eager_only=eager_only, - eager_or_interchange_only=eager_or_interchange_only, - series_only=series_only, - allow_series=allow_series, - ) + def decorator(func: Callable[..., Any]) -> Callable[..., Any]: + @wraps(func) + def wrapper(*args: Any, **kwargs: Any) -> Any: + args = [ + from_native( + arg, + strict=strict, + eager_only=eager_only, + eager_or_interchange_only=eager_or_interchange_only, + series_only=series_only, + allow_series=allow_series, + ) + for arg in args + ] # type: ignore[assignment] + + kwargs = { + name: from_native( + value, + strict=strict, + eager_only=eager_only, + eager_or_interchange_only=eager_or_interchange_only, + series_only=series_only, + allow_series=allow_series, + ) + for name, value in kwargs.items() + } + + backends = { + b() + for v in (*args, *kwargs.values()) + if (b := getattr(v, "__native_namespace__", None)) + } + + if backends.__len__() > 1: + msg = "Found multiple backends. Make sure that all dataframe/series inputs come from the same backend." + raise ValueError(msg) + + result = func(*args, **kwargs) + + return to_native(result, strict=strict) + + return wrapper + + if func is None: + return decorator + else: + # If func is not None, it means the decorator is used without arguments + return decorator(func) def all() -> Expr: diff --git a/tests/frame/lazy_test.py b/tests/frame/lazy_test.py index 14b9d1488..09ca734c2 100644 --- a/tests/frame/lazy_test.py +++ b/tests/frame/lazy_test.py @@ -1,9 +1,13 @@ from typing import Any -import narwhals.stable.v1 as nw +import narwhals as nw +import narwhals.stable.v1 as nw_v1 def test_lazy(constructor_eager: Any) -> None: df = nw.from_native(constructor_eager({"a": [1, 2, 3]}), eager_only=True) result = df.lazy() assert isinstance(result, nw.LazyFrame) + df = nw_v1.from_native(constructor_eager({"a": [1, 2, 3]}), eager_only=True) + result = df.lazy() + assert isinstance(result, nw_v1.LazyFrame) diff --git a/tests/stable_api_test.py b/tests/stable_api_test.py index 375af4f4d..d579b8185 100644 --- a/tests/stable_api_test.py +++ b/tests/stable_api_test.py @@ -1,3 +1,5 @@ +from typing import Any + import polars as pl import pytest @@ -36,6 +38,33 @@ def test_renamed_taxicab_norm(constructor: Constructor) -> None: compare_dicts(result, expected) +def test_renamed_taxicab_norm_dataframe(constructor: Constructor) -> None: + # Suppose we have `DataFrame._l1_norm` in `stable.v1`, but remove it + # in the main namespace. Here, we check that it's still usable from + # the stable api. + def func(df_any: Any) -> Any: + df = nw_v1.from_native(df_any) + df = df._l1_norm() + return df.to_native() + + result = nw_v1.from_native(func(constructor({"a": [1, 2, 3, -4, 5]}))) + expected = {"a": [15]} + compare_dicts(result, expected) + + +def test_renamed_taxicab_norm_dataframe_narwhalify(constructor: Constructor) -> None: + # Suppose we have `DataFrame._l1_norm` in `stable.v1`, but remove it + # in the main namespace. Here, we check that it's still usable from + # the stable api when using `narwhalify`. 
+ @nw_v1.narwhalify + def func(df: Any) -> Any: + return df._l1_norm() + + result = nw_v1.from_native(func(constructor({"a": [1, 2, 3, -4, 5]}))) + expected = {"a": [15]} + compare_dicts(result, expected) + + def test_stable_api_completeness() -> None: v_1_api = nw_v1.__all__ main_namespace_api = nw.__all__ From bf55d8109f0a7e073572e405bfb2d48f960d0d00 Mon Sep 17 00:00:00 2001 From: Marco Edward Gorelli Date: Sat, 28 Sep 2024 15:26:28 +0200 Subject: [PATCH 060/145] chore: rename translate_dtype to native_to_narwhals_dtype (#1089) --- narwhals/_arrow/dataframe.py | 4 ++-- narwhals/_arrow/series.py | 4 ++-- narwhals/_arrow/utils.py | 2 +- narwhals/_dask/dataframe.py | 4 ++-- narwhals/_pandas_like/dataframe.py | 4 ++-- narwhals/_pandas_like/series.py | 4 ++-- narwhals/_pandas_like/utils.py | 2 +- narwhals/_polars/dataframe.py | 10 +++++----- narwhals/_polars/series.py | 4 ++-- narwhals/_polars/utils.py | 2 +- 10 files changed, 20 insertions(+), 20 deletions(-) diff --git a/narwhals/_arrow/dataframe.py b/narwhals/_arrow/dataframe.py index 717238643..12cb16d2e 100644 --- a/narwhals/_arrow/dataframe.py +++ b/narwhals/_arrow/dataframe.py @@ -10,8 +10,8 @@ from narwhals._arrow.utils import broadcast_series from narwhals._arrow.utils import convert_str_slice_to_int_slice +from narwhals._arrow.utils import native_to_narwhals_dtype from narwhals._arrow.utils import select_rows -from narwhals._arrow.utils import translate_dtype from narwhals._arrow.utils import validate_dataframe_comparand from narwhals._expression_parsing import evaluate_into_exprs from narwhals.dependencies import is_numpy_array @@ -234,7 +234,7 @@ def __getitem__( def schema(self) -> dict[str, DType]: schema = self._native_frame.schema return { - name: translate_dtype(dtype) + name: native_to_narwhals_dtype(dtype) for name, dtype in zip(schema.names, schema.types) } diff --git a/narwhals/_arrow/series.py b/narwhals/_arrow/series.py index f3888bc53..a3c3e89e9 100644 --- a/narwhals/_arrow/series.py +++ b/narwhals/_arrow/series.py @@ -11,7 +11,7 @@ from narwhals._arrow.utils import cast_for_truediv from narwhals._arrow.utils import floordiv_compat from narwhals._arrow.utils import narwhals_to_native_dtype -from narwhals._arrow.utils import translate_dtype +from narwhals._arrow.utils import native_to_narwhals_dtype from narwhals._arrow.utils import validate_column_comparand from narwhals.utils import Implementation from narwhals.utils import generate_unique_token @@ -365,7 +365,7 @@ def alias(self, name: str) -> Self: @property def dtype(self: Self) -> DType: - return translate_dtype(self._native_series.type) + return native_to_narwhals_dtype(self._native_series.type) def abs(self) -> Self: import pyarrow.compute as pc # ignore-banned-import() diff --git a/narwhals/_arrow/utils.py b/narwhals/_arrow/utils.py index 7d6844e57..90195a386 100644 --- a/narwhals/_arrow/utils.py +++ b/narwhals/_arrow/utils.py @@ -13,7 +13,7 @@ from narwhals._arrow.series import ArrowSeries -def translate_dtype(dtype: Any) -> dtypes.DType: +def native_to_narwhals_dtype(dtype: Any) -> dtypes.DType: import pyarrow as pa # ignore-banned-import if pa.types.is_int64(dtype): diff --git a/narwhals/_dask/dataframe.py b/narwhals/_dask/dataframe.py index ed001e105..9538d6d89 100644 --- a/narwhals/_dask/dataframe.py +++ b/narwhals/_dask/dataframe.py @@ -8,7 +8,7 @@ from narwhals._dask.utils import add_row_index from narwhals._dask.utils import parse_exprs_and_named_exprs -from narwhals._pandas_like.utils import translate_dtype +from narwhals._pandas_like.utils import 
native_to_narwhals_dtype from narwhals.utils import Implementation from narwhals.utils import flatten from narwhals.utils import generate_unique_token @@ -140,7 +140,7 @@ def drop_nulls(self: Self, subset: str | list[str] | None) -> Self: @property def schema(self) -> dict[str, DType]: return { - col: translate_dtype(self._native_frame.loc[:, col]) + col: native_to_narwhals_dtype(self._native_frame.loc[:, col]) for col in self._native_frame.columns } diff --git a/narwhals/_pandas_like/dataframe.py b/narwhals/_pandas_like/dataframe.py index b77d169d3..47a6bb39b 100644 --- a/narwhals/_pandas_like/dataframe.py +++ b/narwhals/_pandas_like/dataframe.py @@ -14,7 +14,7 @@ from narwhals._pandas_like.utils import convert_str_slice_to_int_slice from narwhals._pandas_like.utils import create_native_series from narwhals._pandas_like.utils import horizontal_concat -from narwhals._pandas_like.utils import translate_dtype +from narwhals._pandas_like.utils import native_to_narwhals_dtype from narwhals._pandas_like.utils import validate_dataframe_comparand from narwhals.dependencies import is_numpy_array from narwhals.utils import Implementation @@ -265,7 +265,7 @@ def iter_rows( @property def schema(self) -> dict[str, DType]: return { - col: translate_dtype(self._native_frame.loc[:, col]) + col: native_to_narwhals_dtype(self._native_frame.loc[:, col]) for col in self._native_frame.columns } diff --git a/narwhals/_pandas_like/series.py b/narwhals/_pandas_like/series.py index d531b4641..8557f8eee 100644 --- a/narwhals/_pandas_like/series.py +++ b/narwhals/_pandas_like/series.py @@ -11,9 +11,9 @@ from narwhals._pandas_like.utils import int_dtype_mapper from narwhals._pandas_like.utils import narwhals_to_native_dtype from narwhals._pandas_like.utils import native_series_from_iterable +from narwhals._pandas_like.utils import native_to_narwhals_dtype from narwhals._pandas_like.utils import set_axis from narwhals._pandas_like.utils import to_datetime -from narwhals._pandas_like.utils import translate_dtype from narwhals._pandas_like.utils import validate_column_comparand from narwhals.utils import Implementation @@ -167,7 +167,7 @@ def shape(self) -> tuple[int]: @property def dtype(self: Self) -> DType: - return translate_dtype(self._native_series) + return native_to_narwhals_dtype(self._native_series) def scatter(self, indices: int | Sequence[int], values: Any) -> Self: if isinstance(values, self.__class__): diff --git a/narwhals/_pandas_like/utils.py b/narwhals/_pandas_like/utils.py index 5745ffd8a..286d712bf 100644 --- a/narwhals/_pandas_like/utils.py +++ b/narwhals/_pandas_like/utils.py @@ -206,7 +206,7 @@ def set_axis( return obj.set_axis(index, axis=0, **kwargs) # type: ignore[attr-defined, no-any-return] -def translate_dtype(column: Any) -> DType: +def native_to_narwhals_dtype(column: Any) -> DType: from narwhals import dtypes dtype = str(column.dtype) diff --git a/narwhals/_polars/dataframe.py b/narwhals/_polars/dataframe.py index 0a6358813..acf70778b 100644 --- a/narwhals/_polars/dataframe.py +++ b/narwhals/_polars/dataframe.py @@ -6,7 +6,7 @@ from narwhals._polars.namespace import PolarsNamespace from narwhals._polars.utils import convert_str_slice_to_int_slice from narwhals._polars.utils import extract_args_kwargs -from narwhals._polars.utils import translate_dtype +from narwhals._polars.utils import native_to_narwhals_dtype from narwhals.utils import Implementation from narwhals.utils import is_sequence_but_not_str from narwhals.utils import parse_columns_to_drop @@ -78,14 +78,14 @@ def 
__array__(self, dtype: Any | None = None, copy: bool | None = None) -> np.nd @property def schema(self) -> dict[str, Any]: schema = self._native_frame.schema - return {name: translate_dtype(dtype) for name, dtype in schema.items()} + return {name: native_to_narwhals_dtype(dtype) for name, dtype in schema.items()} def collect_schema(self) -> dict[str, Any]: if self._backend_version < (1,): # pragma: no cover schema = self._native_frame.schema else: schema = dict(self._native_frame.collect_schema()) - return {name: translate_dtype(dtype) for name, dtype in schema.items()} + return {name: native_to_narwhals_dtype(dtype) for name, dtype in schema.items()} @property def shape(self) -> tuple[int, int]: @@ -257,14 +257,14 @@ def columns(self) -> list[str]: @property def schema(self) -> dict[str, Any]: schema = self._native_frame.schema - return {name: translate_dtype(dtype) for name, dtype in schema.items()} + return {name: native_to_narwhals_dtype(dtype) for name, dtype in schema.items()} def collect_schema(self) -> dict[str, Any]: if self._backend_version < (1,): # pragma: no cover schema = self._native_frame.schema else: schema = dict(self._native_frame.collect_schema()) - return {name: translate_dtype(dtype) for name, dtype in schema.items()} + return {name: native_to_narwhals_dtype(dtype) for name, dtype in schema.items()} def collect(self) -> PolarsDataFrame: return PolarsDataFrame( diff --git a/narwhals/_polars/series.py b/narwhals/_polars/series.py index e8fa28195..7f7bf94a2 100644 --- a/narwhals/_polars/series.py +++ b/narwhals/_polars/series.py @@ -19,7 +19,7 @@ from narwhals.dtypes import DType from narwhals._polars.utils import narwhals_to_native_dtype -from narwhals._polars.utils import translate_dtype +from narwhals._polars.utils import native_to_narwhals_dtype class PolarsSeries: @@ -81,7 +81,7 @@ def name(self) -> str: @property def dtype(self: Self) -> DType: - return translate_dtype(self._native_series.dtype) + return native_to_narwhals_dtype(self._native_series.dtype) @overload def __getitem__(self, item: int) -> Any: ... 
diff --git a/narwhals/_polars/utils.py b/narwhals/_polars/utils.py index 4a9809fc4..45c464e51 100644 --- a/narwhals/_polars/utils.py +++ b/narwhals/_polars/utils.py @@ -26,7 +26,7 @@ def extract_args_kwargs(args: Any, kwargs: Any) -> tuple[list[Any], dict[str, An return args, kwargs -def translate_dtype(dtype: Any) -> dtypes.DType: +def native_to_narwhals_dtype(dtype: Any) -> dtypes.DType: import polars as pl # ignore-banned-import() if dtype == pl.Float64: From 3015eff66d7b3dc29c815f98f6bae89fa106b846 Mon Sep 17 00:00:00 2001 From: Francesco Bruzzesi <42817048+FBruzzesi@users.noreply.github.com> Date: Sat, 28 Sep 2024 20:02:05 +0200 Subject: [PATCH 061/145] fix: `drop_nulls` with subset of length>1 (#1090) * fix: drop_nulls with subset * reduce_output_names --- narwhals/_arrow/namespace.py | 19 +++++++------------ narwhals/_dask/namespace.py | 19 +++++++------------ narwhals/_expression_parsing.py | 9 +++++++++ narwhals/_pandas_like/namespace.py | 19 +++++++------------ tests/frame/drop_nulls_test.py | 17 +++++++++++------ 5 files changed, 41 insertions(+), 42 deletions(-) diff --git a/narwhals/_arrow/namespace.py b/narwhals/_arrow/namespace.py index 6f699fa2a..e1fb8c6b9 100644 --- a/narwhals/_arrow/namespace.py +++ b/narwhals/_arrow/namespace.py @@ -16,6 +16,7 @@ from narwhals._arrow.utils import vertical_concat from narwhals._expression_parsing import combine_root_names from narwhals._expression_parsing import parse_into_exprs +from narwhals._expression_parsing import reduce_output_names from narwhals.utils import Implementation if TYPE_CHECKING: @@ -183,9 +184,7 @@ def all_horizontal(self, *exprs: IntoArrowExpr) -> ArrowExpr: parsed_exprs = parse_into_exprs(*exprs, namespace=self) def func(df: ArrowDataFrame) -> list[ArrowSeries]: - series = [] - for _expr in parsed_exprs: - series.extend(list(_expr._call(df))) + series = (s for _expr in parsed_exprs for s in _expr._call(df)) return [reduce(lambda x, y: x & y, series)] return self._create_expr_from_callable( @@ -193,16 +192,14 @@ def func(df: ArrowDataFrame) -> list[ArrowSeries]: depth=max(x._depth for x in parsed_exprs) + 1, function_name="all_horizontal", root_names=combine_root_names(parsed_exprs), - output_names=parsed_exprs[0]._output_names, + output_names=reduce_output_names(parsed_exprs), ) def any_horizontal(self, *exprs: IntoArrowExpr) -> ArrowExpr: parsed_exprs = parse_into_exprs(*exprs, namespace=self) def func(df: ArrowDataFrame) -> list[ArrowSeries]: - series = [] - for _expr in parsed_exprs: - series.extend(list(_expr._call(df))) + series = (s for _expr in parsed_exprs for s in _expr._call(df)) return [reduce(lambda x, y: x | y, series)] return self._create_expr_from_callable( @@ -210,16 +207,14 @@ def func(df: ArrowDataFrame) -> list[ArrowSeries]: depth=max(x._depth for x in parsed_exprs) + 1, function_name="any_horizontal", root_names=combine_root_names(parsed_exprs), - output_names=parsed_exprs[0]._output_names, + output_names=reduce_output_names(parsed_exprs), ) def sum_horizontal(self, *exprs: IntoArrowExpr) -> ArrowExpr: parsed_exprs = parse_into_exprs(*exprs, namespace=self) def func(df: ArrowDataFrame) -> list[ArrowSeries]: - series = [] - for _expr in parsed_exprs: - series.extend([_series.fill_null(0) for _series in _expr._call(df)]) + series = (s.fill_null(0) for _expr in parsed_exprs for s in _expr._call(df)) return [reduce(lambda x, y: x + y, series)] return self._create_expr_from_callable( @@ -227,7 +222,7 @@ def func(df: ArrowDataFrame) -> list[ArrowSeries]: depth=max(x._depth for x in parsed_exprs) + 1, 
function_name="sum_horizontal", root_names=combine_root_names(parsed_exprs), - output_names=parsed_exprs[0]._output_names, + output_names=reduce_output_names(parsed_exprs), ) def mean_horizontal(self, *exprs: IntoArrowExpr) -> IntoArrowExpr: diff --git a/narwhals/_dask/namespace.py b/narwhals/_dask/namespace.py index 7d661f063..39c647192 100644 --- a/narwhals/_dask/namespace.py +++ b/narwhals/_dask/namespace.py @@ -17,6 +17,7 @@ from narwhals._dask.utils import validate_comparand from narwhals._expression_parsing import combine_root_names from narwhals._expression_parsing import parse_into_exprs +from narwhals._expression_parsing import reduce_output_names if TYPE_CHECKING: import dask_expr @@ -155,9 +156,7 @@ def all_horizontal(self, *exprs: IntoDaskExpr) -> DaskExpr: parsed_exprs = parse_into_exprs(*exprs, namespace=self) def func(df: DaskLazyFrame) -> list[dask_expr.Series]: - series = [] - for _expr in parsed_exprs: - series.extend(list(_expr._call(df))) + series = [s for _expr in parsed_exprs for s in _expr._call(df)] return [reduce(lambda x, y: x & y, series).rename(series[0].name)] return DaskExpr( @@ -165,7 +164,7 @@ def func(df: DaskLazyFrame) -> list[dask_expr.Series]: depth=max(x._depth for x in parsed_exprs) + 1, function_name="all_horizontal", root_names=combine_root_names(parsed_exprs), - output_names=parsed_exprs[0]._output_names, + output_names=reduce_output_names(parsed_exprs), returns_scalar=False, backend_version=self._backend_version, ) @@ -174,9 +173,7 @@ def any_horizontal(self, *exprs: IntoDaskExpr) -> DaskExpr: parsed_exprs = parse_into_exprs(*exprs, namespace=self) def func(df: DaskLazyFrame) -> list[dask_expr.Series]: - series = [] - for _expr in parsed_exprs: - series.extend(list(_expr._call(df))) + series = [s for _expr in parsed_exprs for s in _expr._call(df)] return [reduce(lambda x, y: x | y, series).rename(series[0].name)] return DaskExpr( @@ -184,7 +181,7 @@ def func(df: DaskLazyFrame) -> list[dask_expr.Series]: depth=max(x._depth for x in parsed_exprs) + 1, function_name="any_horizontal", root_names=combine_root_names(parsed_exprs), - output_names=parsed_exprs[0]._output_names, + output_names=reduce_output_names(parsed_exprs), returns_scalar=False, backend_version=self._backend_version, ) @@ -193,9 +190,7 @@ def sum_horizontal(self, *exprs: IntoDaskExpr) -> DaskExpr: parsed_exprs = parse_into_exprs(*exprs, namespace=self) def func(df: DaskLazyFrame) -> list[dask_expr.Series]: - series = [] - for _expr in parsed_exprs: - series.extend([_series.fillna(0) for _series in _expr._call(df)]) + series = [s.fillna(0) for _expr in parsed_exprs for s in _expr._call(df)] return [reduce(lambda x, y: x + y, series).rename(series[0].name)] return DaskExpr( @@ -203,7 +198,7 @@ def func(df: DaskLazyFrame) -> list[dask_expr.Series]: depth=max(x._depth for x in parsed_exprs) + 1, function_name="sum_horizontal", root_names=combine_root_names(parsed_exprs), - output_names=parsed_exprs[0]._output_names, + output_names=reduce_output_names(parsed_exprs), returns_scalar=False, backend_version=self._backend_version, ) diff --git a/narwhals/_expression_parsing.py b/narwhals/_expression_parsing.py index 35d0f7ec1..d281cc945 100644 --- a/narwhals/_expression_parsing.py +++ b/narwhals/_expression_parsing.py @@ -319,3 +319,12 @@ def combine_root_names(parsed_exprs: Sequence[CompliantExpr]) -> list[str] | Non root_names = None break return root_names + + +def reduce_output_names(parsed_exprs: Sequence[CompliantExpr]) -> list[str] | None: + """Returns the left-most output name""" + return 
( + parsed_exprs[0]._output_names[:1] + if parsed_exprs[0]._output_names is not None + else None + ) diff --git a/narwhals/_pandas_like/namespace.py b/narwhals/_pandas_like/namespace.py index 357ef80ab..7356524d3 100644 --- a/narwhals/_pandas_like/namespace.py +++ b/narwhals/_pandas_like/namespace.py @@ -11,6 +11,7 @@ from narwhals import dtypes from narwhals._expression_parsing import combine_root_names from narwhals._expression_parsing import parse_into_exprs +from narwhals._expression_parsing import reduce_output_names from narwhals._pandas_like.dataframe import PandasLikeDataFrame from narwhals._pandas_like.expr import PandasLikeExpr from narwhals._pandas_like.selectors import PandasSelectorNamespace @@ -218,9 +219,7 @@ def sum_horizontal(self, *exprs: IntoPandasLikeExpr) -> PandasLikeExpr: parsed_exprs = parse_into_exprs(*exprs, namespace=self) def func(df: PandasLikeDataFrame) -> list[PandasLikeSeries]: - series = [] - for _expr in parsed_exprs: - series.extend([_series.fill_null(0) for _series in _expr._call(df)]) + series = (s.fill_null(0) for _expr in parsed_exprs for s in _expr._call(df)) return [reduce(lambda x, y: x + y, series)] return self._create_expr_from_callable( @@ -228,16 +227,14 @@ def func(df: PandasLikeDataFrame) -> list[PandasLikeSeries]: depth=max(x._depth for x in parsed_exprs) + 1, function_name="sum_horizontal", root_names=combine_root_names(parsed_exprs), - output_names=parsed_exprs[0]._output_names, + output_names=reduce_output_names(parsed_exprs), ) def all_horizontal(self, *exprs: IntoPandasLikeExpr) -> PandasLikeExpr: parsed_exprs = parse_into_exprs(*exprs, namespace=self) def func(df: PandasLikeDataFrame) -> list[PandasLikeSeries]: - series = [] - for _expr in parsed_exprs: - series.extend(list(_expr._call(df))) + series = (s for _expr in parsed_exprs for s in _expr._call(df)) return [reduce(lambda x, y: x & y, series)] return self._create_expr_from_callable( @@ -245,16 +242,14 @@ def func(df: PandasLikeDataFrame) -> list[PandasLikeSeries]: depth=max(x._depth for x in parsed_exprs) + 1, function_name="all_horizontal", root_names=combine_root_names(parsed_exprs), - output_names=parsed_exprs[0]._output_names, + output_names=reduce_output_names(parsed_exprs), ) def any_horizontal(self, *exprs: IntoPandasLikeExpr) -> PandasLikeExpr: parsed_exprs = parse_into_exprs(*exprs, namespace=self) def func(df: PandasLikeDataFrame) -> list[PandasLikeSeries]: - series = [] - for _expr in parsed_exprs: - series.extend(list(_expr._call(df))) + series = (s for _expr in parsed_exprs for s in _expr._call(df)) return [reduce(lambda x, y: x | y, series)] return self._create_expr_from_callable( @@ -262,7 +257,7 @@ def func(df: PandasLikeDataFrame) -> list[PandasLikeSeries]: depth=max(x._depth for x in parsed_exprs) + 1, function_name="any_horizontal", root_names=combine_root_names(parsed_exprs), - output_names=parsed_exprs[0]._output_names, + output_names=reduce_output_names(parsed_exprs), ) def mean_horizontal(self, *exprs: IntoPandasLikeExpr) -> PandasLikeExpr: diff --git a/tests/frame/drop_nulls_test.py b/tests/frame/drop_nulls_test.py index 4c2276030..9988aa6b2 100644 --- a/tests/frame/drop_nulls_test.py +++ b/tests/frame/drop_nulls_test.py @@ -21,11 +21,16 @@ def test_drop_nulls(constructor: Constructor) -> None: compare_dicts(result, expected) -@pytest.mark.parametrize("subset", ["a", ["a"]]) -def test_drop_nulls_subset(constructor: Constructor, subset: str | list[str]) -> None: +@pytest.mark.parametrize( + ("subset", "expected"), + [ + ("a", {"a": [1, 2.0, 4.0], "b": 
[float("nan"), 3.0, 5.0]}), + (["a"], {"a": [1, 2.0, 4.0], "b": [float("nan"), 3.0, 5.0]}), + (["a", "b"], {"a": [2.0, 4.0], "b": [3.0, 5.0]}), + ], +) +def test_drop_nulls_subset( + constructor: Constructor, subset: str | list[str], expected: dict[str, float] +) -> None: result = nw.from_native(constructor(data)).drop_nulls(subset=subset) - expected = { - "a": [1, 2.0, 4.0], - "b": [float("nan"), 3.0, 5.0], - } compare_dicts(result, expected) From 894fa3ac528a32116cf1c4860c3bce7c39297977 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Dea=20Mar=C3=ADa=20L=C3=A9on?= Date: Sat, 28 Sep 2024 20:03:39 +0200 Subject: [PATCH 062/145] docs: Added new logo (#1085) * modified mkdocs to include logo * Add new logo --- docs/assets/logo.svg | 6 ++++++ docs/css/mkdocstrings.css | 9 +++++++++ mkdocs.yml | 7 +++++-- 3 files changed, 20 insertions(+), 2 deletions(-) create mode 100644 docs/assets/logo.svg create mode 100644 docs/css/mkdocstrings.css diff --git a/docs/assets/logo.svg b/docs/assets/logo.svg new file mode 100644 index 000000000..56ecf5c55 --- /dev/null +++ b/docs/assets/logo.svg @@ -0,0 +1,6 @@ + + + \ No newline at end of file diff --git a/docs/css/mkdocstrings.css b/docs/css/mkdocstrings.css new file mode 100644 index 000000000..0951698ad --- /dev/null +++ b/docs/css/mkdocstrings.css @@ -0,0 +1,9 @@ +.md-header__topic { + font-size: 200%; + font-family: Verdana, Geneva, Tahoma, sans-serif; +} + +.md-header__button.md-logo img { + height: 22%; + width: 22%; + } \ No newline at end of file diff --git a/mkdocs.yml b/mkdocs.yml index 045a6a679..ccd4307ae 100644 --- a/mkdocs.yml +++ b/mkdocs.yml @@ -47,8 +47,8 @@ nav: theme: name: material font: false - favicon: assets/image.png - logo: assets/image.png + favicon: assets/logo.svg + logo: assets/logo.svg features: - content.code.copy - content.code.annotate @@ -75,6 +75,9 @@ theme: toggle: icon: material/brightness-4 name: Switch to system preference +extra_css: + - https://unpkg.com/katex@0/dist/katex.min.css + - css/mkdocstrings.css plugins: - search From 9c5af5f0269d4e35f6225df7a1c9bcbf10e35e68 Mon Sep 17 00:00:00 2001 From: Vahideh Alizadeh <82591913+V-Alizade@users.noreply.github.com> Date: Sun, 29 Sep 2024 10:10:46 +0200 Subject: [PATCH 063/145] ci: add name-tests-test to pre-commit (#1093) * add name-tests-test to pre-commit * ci: update name-tests-test and rename all necessary files --- .pre-commit-config.yaml | 5 ++ tests/frame/invalid_test.py | 43 +++++++++++++++++ tests/frame/test_invalid.py | 48 ------------------- tests/{test_group_by.py => group_by_test.py} | 0 ...arithmetic.py => basic_arithmetic_test.py} | 0 .../{test_concat.py => concat_test.py} | 0 .../hypothesis/{test_join.py => join_test.py} | 0 tests/{test_utils.py => utils_test.py} | 0 .../{test_queries.py => queries_test.py} | 0 9 files changed, 48 insertions(+), 48 deletions(-) delete mode 100644 tests/frame/test_invalid.py rename tests/{test_group_by.py => group_by_test.py} (100%) rename tests/hypothesis/{test_basic_arithmetic.py => basic_arithmetic_test.py} (100%) rename tests/hypothesis/{test_concat.py => concat_test.py} (100%) rename tests/hypothesis/{test_join.py => join_test.py} (100%) rename tests/{test_utils.py => utils_test.py} (100%) rename tpch/tests/{test_queries.py => queries_test.py} (100%) diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index 08d274f86..a9f01f566 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -46,3 +46,8 @@ repos: args: [--skip-errors] additional_dependencies: - black==22.12.0 +- repo: 
https://github.com/pre-commit/pre-commit-hooks + rev: v4.6.0 + hooks: + - id: name-tests-test + exclude: ^tests/utils\.py \ No newline at end of file diff --git a/tests/frame/invalid_test.py b/tests/frame/invalid_test.py index 41c780a7d..2fdf53949 100644 --- a/tests/frame/invalid_test.py +++ b/tests/frame/invalid_test.py @@ -1,9 +1,11 @@ +import numpy as np import pandas as pd import polars as pl import pyarrow as pa import pytest import narwhals.stable.v1 as nw +from narwhals.utils import parse_version def test_invalid() -> None: @@ -18,3 +20,44 @@ def test_invalid() -> None: df.select([pl.col("a")]) # type: ignore[list-item] with pytest.raises(TypeError, match="Perhaps you:"): df.select([nw.col("a").cast(pl.Int64)]) + + +def test_native_vs_non_native() -> None: + s = pd.Series([1, 2, 3]) + df = pd.DataFrame({"a": [1, 2, 3], "b": [4, 5, 6]}) + with pytest.raises(TypeError, match="Perhaps you forgot"): + nw.from_native(df).filter(s > 1) + s = pl.Series([1, 2, 3]) + df = pl.DataFrame({"a": [2, 2, 3], "b": [4, 5, 6]}) + with pytest.raises(TypeError, match="Perhaps you\n- forgot"): + nw.from_native(df).filter(s > 1) + + +def test_validate_laziness() -> None: + df = pl.DataFrame({"a": [1, 2, 3], "b": [4, 5, 6]}) + with pytest.raises( + NotImplementedError, + match=("The items to concatenate should either all be eager, or all lazy"), + ): + nw.concat([nw.from_native(df, eager_only=True), nw.from_native(df).lazy()]) # type: ignore[list-item] + + +@pytest.mark.skipif( + parse_version(np.__version__) < parse_version("1.26.4"), reason="too old" +) +def test_memmap() -> None: + pytest.importorskip("sklearn") + # the headache this caused me... + from sklearn.utils import check_X_y + from sklearn.utils._testing import create_memmap_backed_data + + x_any = pd.DataFrame({"a": [1, 2, 3], "b": [4, 5, 6]}) + y_any = create_memmap_backed_data(x_any["b"]) + + x_any, y_any = create_memmap_backed_data([x_any, y_any]) + + x = nw.from_native(x_any) + x = x.with_columns(y=nw.from_native(y_any, series_only=True)) + + # check this doesn't raise + check_X_y(nw.to_native(x), nw.to_native(x["y"])) diff --git a/tests/frame/test_invalid.py b/tests/frame/test_invalid.py deleted file mode 100644 index b8bca586f..000000000 --- a/tests/frame/test_invalid.py +++ /dev/null @@ -1,48 +0,0 @@ -import numpy as np -import pandas as pd -import polars as pl -import pytest - -import narwhals.stable.v1 as nw -from narwhals.utils import parse_version - - -def test_native_vs_non_native() -> None: - s = pd.Series([1, 2, 3]) - df = pd.DataFrame({"a": [1, 2, 3], "b": [4, 5, 6]}) - with pytest.raises(TypeError, match="Perhaps you forgot"): - nw.from_native(df).filter(s > 1) - s = pl.Series([1, 2, 3]) - df = pl.DataFrame({"a": [1, 2, 3], "b": [4, 5, 6]}) - with pytest.raises(TypeError, match="Perhaps you\n- forgot"): - nw.from_native(df).filter(s > 1) - - -def test_validate_laziness() -> None: - df = pl.DataFrame({"a": [1, 2, 3], "b": [4, 5, 6]}) - with pytest.raises( - NotImplementedError, - match=("The items to concatenate should either all be eager, or all lazy"), - ): - nw.concat([nw.from_native(df, eager_only=True), nw.from_native(df).lazy()]) # type: ignore[list-item] - - -@pytest.mark.skipif( - parse_version(np.__version__) < parse_version("1.26.4"), reason="too old" -) -def test_memmap() -> None: - pytest.importorskip("sklearn") - # the headache this caused me... 
- from sklearn.utils import check_X_y - from sklearn.utils._testing import create_memmap_backed_data - - x_any = pd.DataFrame({"a": [1, 2, 3], "b": [4, 5, 6]}) - y_any = create_memmap_backed_data(x_any["b"]) - - x_any, y_any = create_memmap_backed_data([x_any, y_any]) - - x = nw.from_native(x_any) - x = x.with_columns(y=nw.from_native(y_any, series_only=True)) - - # check this doesn't raise - check_X_y(nw.to_native(x), nw.to_native(x["y"])) diff --git a/tests/test_group_by.py b/tests/group_by_test.py similarity index 100% rename from tests/test_group_by.py rename to tests/group_by_test.py diff --git a/tests/hypothesis/test_basic_arithmetic.py b/tests/hypothesis/basic_arithmetic_test.py similarity index 100% rename from tests/hypothesis/test_basic_arithmetic.py rename to tests/hypothesis/basic_arithmetic_test.py diff --git a/tests/hypothesis/test_concat.py b/tests/hypothesis/concat_test.py similarity index 100% rename from tests/hypothesis/test_concat.py rename to tests/hypothesis/concat_test.py diff --git a/tests/hypothesis/test_join.py b/tests/hypothesis/join_test.py similarity index 100% rename from tests/hypothesis/test_join.py rename to tests/hypothesis/join_test.py diff --git a/tests/test_utils.py b/tests/utils_test.py similarity index 100% rename from tests/test_utils.py rename to tests/utils_test.py diff --git a/tpch/tests/test_queries.py b/tpch/tests/queries_test.py similarity index 100% rename from tpch/tests/test_queries.py rename to tpch/tests/queries_test.py From 2f1d74feb72a23a5a4723476df183da8ebc507c1 Mon Sep 17 00:00:00 2001 From: Liam Connors Date: Sun, 29 Sep 2024 05:56:15 -0400 Subject: [PATCH 064/145] test: `xfail` test using None on `to_numpy` for cuDF (#1092) * xfail to_numpy test with nulls * fix xfail --- tests/series_only/to_numpy_test.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/tests/series_only/to_numpy_test.py b/tests/series_only/to_numpy_test.py index 433ede16a..2f1464a57 100644 --- a/tests/series_only/to_numpy_test.py +++ b/tests/series_only/to_numpy_test.py @@ -10,8 +10,10 @@ def test_to_numpy(constructor_eager: Any, request: pytest.FixtureRequest) -> None: - if "pandas_constructor" in str(constructor_eager) or "modin_constructor" in str( - constructor_eager + if ( + "pandas_constructor" in str(constructor_eager) + or "modin_constructor" in str(constructor_eager) + or "cudf_constructor" in str(constructor_eager) ): request.applymarker(pytest.mark.xfail) From e3d2a4b98ce9581d17bfac35f8166962847b1be3 Mon Sep 17 00:00:00 2001 From: Marco Edward Gorelli Date: Sun, 29 Sep 2024 21:23:17 +0100 Subject: [PATCH 065/145] feat: add dtypes to stable api (#1087) --- docs/how_it_works.md | 5 ++ narwhals/_arrow/dataframe.py | 43 ++++++++++++--- narwhals/_arrow/expr.py | 27 ++++++++- narwhals/_arrow/namespace.py | 79 +++++++++++++------------- narwhals/_arrow/selectors.py | 44 +++++++++------ narwhals/_arrow/series.py | 26 ++++++--- narwhals/_arrow/utils.py | 9 ++- narwhals/_dask/dataframe.py | 19 +++++-- narwhals/_dask/expr.py | 20 ++++++- narwhals/_dask/group_by.py | 3 +- narwhals/_dask/namespace.py | 80 ++++++++++++--------------- narwhals/_dask/selectors.py | 44 +++++++++------ narwhals/_dask/utils.py | 5 +- narwhals/_duckdb/dataframe.py | 18 +++--- narwhals/_duckdb/series.py | 11 +++- narwhals/_ibis/dataframe.py | 15 +++-- narwhals/_ibis/series.py | 11 +++- narwhals/_interchange/dataframe.py | 19 ++++--- narwhals/_interchange/series.py | 18 +++--- narwhals/_pandas_like/dataframe.py | 19 ++++++- narwhals/_pandas_like/expr.py | 21 ++++++- 
narwhals/_pandas_like/group_by.py | 1 + narwhals/_pandas_like/namespace.py | 76 ++++++++++++++----------- narwhals/_pandas_like/selectors.py | 45 +++++++++------ narwhals/_pandas_like/series.py | 15 ++++- narwhals/_pandas_like/utils.py | 20 ++++--- narwhals/_polars/dataframe.py | 68 +++++++++++++++++------ narwhals/_polars/expr.py | 8 ++- narwhals/_polars/namespace.py | 89 +++++++++++++++--------------- narwhals/_polars/series.py | 26 ++++++--- narwhals/_polars/utils.py | 11 ++-- narwhals/functions.py | 57 +++++++++++++++++-- narwhals/stable/v1/__init__.py | 73 +++++++++++++++--------- narwhals/stable/v1/dtypes.py | 47 ++++++++++++++++ narwhals/translate.py | 71 ++++++++++++++++++++---- narwhals/typing.py | 27 +++++++++ narwhals/utils.py | 3 +- tests/from_dict_test.py | 22 ++++++-- tests/new_series_test.py | 22 +++++++- tests/stable_api_test.py | 8 +++ 40 files changed, 851 insertions(+), 374 deletions(-) create mode 100644 narwhals/stable/v1/dtypes.py diff --git a/docs/how_it_works.md b/docs/how_it_works.md index cda98a2b6..cc808cc6f 100644 --- a/docs/how_it_works.md +++ b/docs/how_it_works.md @@ -75,6 +75,7 @@ from narwhals.utils import parse_version pn = PandasLikeNamespace( implementation=Implementation.PANDAS, backend_version=parse_version(pd.__version__), + dtypes=nw.dtypes, ) print(nw.col("a")._call(pn)) ``` @@ -101,6 +102,7 @@ import pandas as pd pn = PandasLikeNamespace( implementation=Implementation.PANDAS, backend_version=parse_version(pd.__version__), + dtypes=nw.dtypes, ) df_pd = pd.DataFrame({"a": [1, 2, 3], "b": [4, 5, 6]}) @@ -108,6 +110,7 @@ df = PandasLikeDataFrame( df_pd, implementation=Implementation.PANDAS, backend_version=parse_version(pd.__version__), + dtypes=nw.dtypes, ) expression = pn.col("a") + 1 result = expression._call(df) @@ -196,6 +199,7 @@ import pandas as pd pn = PandasLikeNamespace( implementation=Implementation.PANDAS, backend_version=parse_version(pd.__version__), + dtypes=nw.dtypes, ) df_pd = pd.DataFrame({"a": [1, 2, 3], "b": [4, 5, 6]}) @@ -210,6 +214,7 @@ backend, and it does so by passing a Narwhals-compliant namespace to `nw.Expr._c pn = PandasLikeNamespace( implementation=Implementation.PANDAS, backend_version=parse_version(pd.__version__), + dtypes=nw.dtypes, ) expr = (nw.col("a") + 1)._call(pn) print(expr) diff --git a/narwhals/_arrow/dataframe.py b/narwhals/_arrow/dataframe.py index 12cb16d2e..efc343177 100644 --- a/narwhals/_arrow/dataframe.py +++ b/narwhals/_arrow/dataframe.py @@ -33,21 +33,27 @@ from narwhals._arrow.series import ArrowSeries from narwhals._arrow.typing import IntoArrowExpr from narwhals.dtypes import DType + from narwhals.typing import DTypes class ArrowDataFrame: # --- not in the spec --- def __init__( - self, native_dataframe: pa.Table, *, backend_version: tuple[int, ...] 
+ self, + native_dataframe: pa.Table, + *, + backend_version: tuple[int, ...], + dtypes: DTypes, ) -> None: self._native_frame = native_dataframe self._implementation = Implementation.PYARROW self._backend_version = backend_version + self._dtypes = dtypes def __narwhals_namespace__(self) -> ArrowNamespace: from narwhals._arrow.namespace import ArrowNamespace - return ArrowNamespace(backend_version=self._backend_version) + return ArrowNamespace(backend_version=self._backend_version, dtypes=self._dtypes) def __native_namespace__(self: Self) -> ModuleType: if self._implementation is Implementation.PYARROW: @@ -63,7 +69,9 @@ def __narwhals_lazyframe__(self) -> Self: return self def _from_native_frame(self, df: Any) -> Self: - return self.__class__(df, backend_version=self._backend_version) + return self.__class__( + df, backend_version=self._backend_version, dtypes=self._dtypes + ) @property def shape(self) -> tuple[int, int]: @@ -111,6 +119,7 @@ def get_column(self, name: str) -> ArrowSeries: self._native_frame[name], name=name, backend_version=self._backend_version, + dtypes=self._dtypes, ) def __array__(self, dtype: Any = None, copy: bool | None = None) -> np.ndarray: @@ -151,6 +160,7 @@ def __getitem__( self._native_frame[item], name=item, backend_version=self._backend_version, + dtypes=self._dtypes, ) elif ( isinstance(item, tuple) @@ -191,12 +201,14 @@ def __getitem__( self._native_frame[col_name], name=col_name, backend_version=self._backend_version, + dtypes=self._dtypes, ) selected_rows = select_rows(self._native_frame, item[0]) return ArrowSeries( selected_rows[col_name], name=col_name, backend_version=self._backend_version, + dtypes=self._dtypes, ) elif isinstance(item, slice): @@ -234,7 +246,7 @@ def __getitem__( def schema(self) -> dict[str, DType]: schema = self._native_frame.schema return { - name: native_to_narwhals_dtype(dtype) + name: native_to_narwhals_dtype(dtype, self._dtypes) for name, dtype in zip(schema.names, schema.types) } @@ -410,7 +422,12 @@ def to_dict(self, *, as_series: bool) -> Any: from narwhals._arrow.series import ArrowSeries return { - name: ArrowSeries(col, name=name, backend_version=self._backend_version) + name: ArrowSeries( + col, + name=name, + backend_version=self._backend_version, + dtypes=self._dtypes, + ) for name, col in names_and_values } else: @@ -471,7 +488,9 @@ def lazy(self) -> Self: return self def collect(self) -> ArrowDataFrame: - return ArrowDataFrame(self._native_frame, backend_version=self._backend_version) + return ArrowDataFrame( + self._native_frame, backend_version=self._backend_version, dtypes=self._dtypes + ) def clone(self) -> Self: msg = "clone is not yet supported on PyArrow tables" @@ -541,7 +560,12 @@ def is_duplicated(self: Self) -> ArrowSeries: ).column(f"{col_token}_count"), 1, ) - return ArrowSeries(is_duplicated, name="", backend_version=self._backend_version) + return ArrowSeries( + is_duplicated, + name="", + backend_version=self._backend_version, + dtypes=self._dtypes, + ) def is_unique(self: Self) -> ArrowSeries: import pyarrow.compute as pc # ignore-banned-import() @@ -551,7 +575,10 @@ def is_unique(self: Self) -> ArrowSeries: is_duplicated = self.is_duplicated()._native_series return ArrowSeries( - pc.invert(is_duplicated), name="", backend_version=self._backend_version + pc.invert(is_duplicated), + name="", + backend_version=self._backend_version, + dtypes=self._dtypes, ) def unique( diff --git a/narwhals/_arrow/expr.py b/narwhals/_arrow/expr.py index 367dc9b44..6d1001c11 100644 --- a/narwhals/_arrow/expr.py +++ 
b/narwhals/_arrow/expr.py @@ -17,6 +17,7 @@ from narwhals._arrow.series import ArrowSeries from narwhals._arrow.typing import IntoArrowExpr from narwhals.dtypes import DType + from narwhals.typing import DTypes class ArrowExpr: @@ -29,6 +30,7 @@ def __init__( root_names: list[str] | None, output_names: list[str] | None, backend_version: tuple[int, ...], + dtypes: DTypes, ) -> None: self._call = call self._depth = depth @@ -38,6 +40,7 @@ def __init__( self._output_names = output_names self._implementation = Implementation.PYARROW self._backend_version = backend_version + self._dtypes = dtypes def __repr__(self) -> str: # pragma: no cover return ( @@ -50,7 +53,10 @@ def __repr__(self) -> str: # pragma: no cover @classmethod def from_column_names( - cls: type[Self], *column_names: str, backend_version: tuple[int, ...] + cls: type[Self], + *column_names: str, + backend_version: tuple[int, ...], + dtypes: DTypes, ) -> Self: from narwhals._arrow.series import ArrowSeries @@ -60,6 +66,7 @@ def func(df: ArrowDataFrame) -> list[ArrowSeries]: df._native_frame[column_name], name=column_name, backend_version=df._backend_version, + dtypes=df._dtypes, ) for column_name in column_names ] @@ -71,11 +78,15 @@ def func(df: ArrowDataFrame) -> list[ArrowSeries]: root_names=list(column_names), output_names=list(column_names), backend_version=backend_version, + dtypes=dtypes, ) @classmethod def from_column_indices( - cls: type[Self], *column_indices: int, backend_version: tuple[int, ...] + cls: type[Self], + *column_indices: int, + backend_version: tuple[int, ...], + dtypes: DTypes, ) -> Self: from narwhals._arrow.series import ArrowSeries @@ -85,6 +96,7 @@ def func(df: ArrowDataFrame) -> list[ArrowSeries]: df._native_frame[column_index], name=df._native_frame.column_names[column_index], backend_version=df._backend_version, + dtypes=df._dtypes, ) for column_index in column_indices ] @@ -96,12 +108,13 @@ def func(df: ArrowDataFrame) -> list[ArrowSeries]: root_names=None, output_names=None, backend_version=backend_version, + dtypes=dtypes, ) def __narwhals_namespace__(self) -> ArrowNamespace: from narwhals._arrow.namespace import ArrowNamespace - return ArrowNamespace(backend_version=self._backend_version) + return ArrowNamespace(backend_version=self._backend_version, dtypes=self._dtypes) def __narwhals_expr__(self) -> None: ... 
@@ -246,6 +259,7 @@ def alias(self, name: str) -> Self: root_names=self._root_names, output_names=[name], backend_version=self._backend_version, + dtypes=self._dtypes, ) def null_count(self) -> Self: @@ -352,6 +366,7 @@ def func(df: ArrowDataFrame) -> list[ArrowSeries]: root_names=self._root_names, output_names=self._output_names, backend_version=self._backend_version, + dtypes=self._dtypes, ) def mode(self: Self) -> Self: @@ -573,6 +588,7 @@ def keep(self: Self) -> ArrowExpr: root_names=root_names, output_names=root_names, backend_version=self._expr._backend_version, + dtypes=self._expr._dtypes, ) def map(self: Self, function: Callable[[str], str]) -> ArrowExpr: @@ -598,6 +614,7 @@ def map(self: Self, function: Callable[[str], str]) -> ArrowExpr: root_names=root_names, output_names=output_names, backend_version=self._expr._backend_version, + dtypes=self._expr._dtypes, ) def prefix(self: Self, prefix: str) -> ArrowExpr: @@ -621,6 +638,7 @@ def prefix(self: Self, prefix: str) -> ArrowExpr: root_names=root_names, output_names=output_names, backend_version=self._expr._backend_version, + dtypes=self._expr._dtypes, ) def suffix(self: Self, suffix: str) -> ArrowExpr: @@ -645,6 +663,7 @@ def suffix(self: Self, suffix: str) -> ArrowExpr: root_names=root_names, output_names=output_names, backend_version=self._expr._backend_version, + dtypes=self._expr._dtypes, ) def to_lowercase(self: Self) -> ArrowExpr: @@ -669,6 +688,7 @@ def to_lowercase(self: Self) -> ArrowExpr: root_names=root_names, output_names=output_names, backend_version=self._expr._backend_version, + dtypes=self._expr._dtypes, ) def to_uppercase(self: Self) -> ArrowExpr: @@ -693,4 +713,5 @@ def to_uppercase(self: Self) -> ArrowExpr: root_names=root_names, output_names=output_names, backend_version=self._expr._backend_version, + dtypes=self._expr._dtypes, ) diff --git a/narwhals/_arrow/namespace.py b/narwhals/_arrow/namespace.py index e1fb8c6b9..3e7f4ecc9 100644 --- a/narwhals/_arrow/namespace.py +++ b/narwhals/_arrow/namespace.py @@ -7,7 +7,6 @@ from typing import Literal from typing import cast -from narwhals import dtypes from narwhals._arrow.dataframe import ArrowDataFrame from narwhals._arrow.expr import ArrowExpr from narwhals._arrow.selectors import ArrowSelectorNamespace @@ -23,32 +22,11 @@ from typing import Callable from narwhals._arrow.typing import IntoArrowExpr + from narwhals.dtypes import DType + from narwhals.typing import DTypes class ArrowNamespace: - Int64 = dtypes.Int64 - Int32 = dtypes.Int32 - Int16 = dtypes.Int16 - Int8 = dtypes.Int8 - UInt64 = dtypes.UInt64 - UInt32 = dtypes.UInt32 - UInt16 = dtypes.UInt16 - UInt8 = dtypes.UInt8 - Float64 = dtypes.Float64 - Float32 = dtypes.Float32 - Boolean = dtypes.Boolean - Object = dtypes.Object - Unknown = dtypes.Unknown - Categorical = dtypes.Categorical - Enum = dtypes.Enum - String = dtypes.String - Datetime = dtypes.Datetime - Duration = dtypes.Duration - Date = dtypes.Date - List = dtypes.List - Struct = dtypes.Struct - Array = dtypes.Array - def _create_expr_from_callable( self, func: Callable[[ArrowDataFrame], list[ArrowSeries]], @@ -67,6 +45,7 @@ def _create_expr_from_callable( root_names=root_names, output_names=output_names, backend_version=self._backend_version, + dtypes=self._dtypes, ) def _create_expr_from_series(self, series: ArrowSeries) -> ArrowExpr: @@ -79,6 +58,7 @@ def _create_expr_from_series(self, series: ArrowSeries) -> ArrowExpr: root_names=None, output_names=None, backend_version=self._backend_version, + dtypes=self._dtypes, ) def 
_create_series_from_scalar(self, value: Any, series: ArrowSeries) -> ArrowSeries: @@ -90,6 +70,7 @@ def _create_series_from_scalar(self, value: Any, series: ArrowSeries) -> ArrowSe [value], name=series.name, backend_version=self._backend_version, + dtypes=self._dtypes, ) def _create_compliant_series(self, value: Any) -> ArrowSeries: @@ -101,26 +82,28 @@ def _create_compliant_series(self, value: Any) -> ArrowSeries: native_series=pa.chunked_array([value]), name="", backend_version=self._backend_version, + dtypes=self._dtypes, ) # --- not in spec --- - def __init__(self, *, backend_version: tuple[int, ...]) -> None: + def __init__(self, *, backend_version: tuple[int, ...], dtypes: DTypes) -> None: self._backend_version = backend_version self._implementation = Implementation.PYARROW + self._dtypes = dtypes # --- selection --- def col(self, *column_names: str) -> ArrowExpr: from narwhals._arrow.expr import ArrowExpr return ArrowExpr.from_column_names( - *column_names, backend_version=self._backend_version + *column_names, backend_version=self._backend_version, dtypes=self._dtypes ) def nth(self, *column_indices: int) -> ArrowExpr: from narwhals._arrow.expr import ArrowExpr return ArrowExpr.from_column_indices( - *column_indices, backend_version=self._backend_version + *column_indices, backend_version=self._backend_version, dtypes=self._dtypes ) def len(self) -> ArrowExpr: @@ -131,6 +114,7 @@ def len(self) -> ArrowExpr: [len(df._native_frame)], name="len", backend_version=self._backend_version, + dtypes=self._dtypes, ) ], depth=0, @@ -138,6 +122,7 @@ def len(self) -> ArrowExpr: root_names=None, output_names=["len"], backend_version=self._backend_version, + dtypes=self._dtypes, ) def all(self) -> ArrowExpr: @@ -150,6 +135,7 @@ def all(self) -> ArrowExpr: df._native_frame[column_name], name=column_name, backend_version=df._backend_version, + dtypes=df._dtypes, ) for column_name in df.columns ], @@ -158,14 +144,16 @@ def all(self) -> ArrowExpr: root_names=None, output_names=None, backend_version=self._backend_version, + dtypes=self._dtypes, ) - def lit(self, value: Any, dtype: dtypes.DType | None) -> ArrowExpr: + def lit(self, value: Any, dtype: DType | None) -> ArrowExpr: def _lit_arrow_series(_: ArrowDataFrame) -> ArrowSeries: arrow_series = ArrowSeries._from_iterable( data=[value], name="lit", backend_version=self._backend_version, + dtypes=self._dtypes, ) if dtype: return arrow_series.cast(dtype) @@ -178,6 +166,7 @@ def _lit_arrow_series(_: ArrowDataFrame) -> ArrowSeries: root_names=None, output_names=["lit"], backend_version=self._backend_version, + dtypes=self._dtypes, ) def all_horizontal(self, *exprs: IntoArrowExpr) -> ArrowExpr: @@ -230,7 +219,7 @@ def mean_horizontal(self, *exprs: IntoArrowExpr) -> IntoArrowExpr: total = reduce(lambda x, y: x + y, (e.fill_null(0.0) for e in arrow_exprs)) n_non_zero = reduce( lambda x, y: x + y, - ((1 - e.is_null().cast(self.Int64())) for e in arrow_exprs), + ((1 - e.is_null().cast(self._dtypes.Int64())) for e in arrow_exprs), ) return total / n_non_zero @@ -246,54 +235,54 @@ def concat( return ArrowDataFrame( horizontal_concat(dfs), backend_version=self._backend_version, + dtypes=self._dtypes, ) if how == "vertical": return ArrowDataFrame( vertical_concat(dfs), backend_version=self._backend_version, + dtypes=self._dtypes, ) raise NotImplementedError def sum(self, *column_names: str) -> ArrowExpr: return ArrowExpr.from_column_names( - *column_names, - backend_version=self._backend_version, + *column_names, backend_version=self._backend_version, 
dtypes=self._dtypes ).sum() def mean(self, *column_names: str) -> ArrowExpr: return ArrowExpr.from_column_names( - *column_names, - backend_version=self._backend_version, + *column_names, backend_version=self._backend_version, dtypes=self._dtypes ).mean() def max(self, *column_names: str) -> ArrowExpr: return ArrowExpr.from_column_names( - *column_names, - backend_version=self._backend_version, + *column_names, backend_version=self._backend_version, dtypes=self._dtypes ).max() def min(self, *column_names: str) -> ArrowExpr: return ArrowExpr.from_column_names( - *column_names, - backend_version=self._backend_version, + *column_names, backend_version=self._backend_version, dtypes=self._dtypes ).min() @property def selectors(self) -> ArrowSelectorNamespace: - return ArrowSelectorNamespace(backend_version=self._backend_version) + return ArrowSelectorNamespace( + backend_version=self._backend_version, dtypes=self._dtypes + ) def when( self, *predicates: IntoArrowExpr, ) -> ArrowWhen: - plx = self.__class__(backend_version=self._backend_version) + plx = self.__class__(backend_version=self._backend_version, dtypes=self._dtypes) if predicates: condition = plx.all_horizontal(*predicates) else: msg = "at least one predicate needs to be provided" raise TypeError(msg) - return ArrowWhen(condition, self._backend_version) + return ArrowWhen(condition, self._backend_version, dtypes=self._dtypes) class ArrowWhen: @@ -303,11 +292,14 @@ def __init__( backend_version: tuple[int, ...], then_value: Any = None, otherwise_value: Any = None, + *, + dtypes: DTypes, ) -> None: self._backend_version = backend_version self._condition = condition self._then_value = then_value self._otherwise_value = otherwise_value + self._dtypes = dtypes def __call__(self, df: ArrowDataFrame) -> list[ArrowSeries]: import pyarrow as pa # ignore-banned-import @@ -316,7 +308,7 @@ def __call__(self, df: ArrowDataFrame) -> list[ArrowSeries]: from narwhals._arrow.namespace import ArrowNamespace from narwhals._expression_parsing import parse_into_expr - plx = ArrowNamespace(backend_version=self._backend_version) + plx = ArrowNamespace(backend_version=self._backend_version, dtypes=self._dtypes) condition = parse_into_expr(self._condition, namespace=plx)._call(df)[0] # type: ignore[arg-type] try: @@ -327,6 +319,7 @@ def __call__(self, df: ArrowDataFrame) -> list[ArrowSeries]: [self._then_value] * len(condition), name="literal", backend_version=self._backend_version, + dtypes=self._dtypes, ) value_series = cast(ArrowSeries, value_series) @@ -370,6 +363,7 @@ def then(self, value: ArrowExpr | ArrowSeries | Any) -> ArrowThen: root_names=None, output_names=None, backend_version=self._backend_version, + dtypes=self._dtypes, ) @@ -383,9 +377,10 @@ def __init__( root_names: list[str] | None, output_names: list[str] | None, backend_version: tuple[int, ...], + dtypes: DTypes, ) -> None: self._backend_version = backend_version - + self._dtypes = dtypes self._call = call self._depth = depth self._function_name = function_name diff --git a/narwhals/_arrow/selectors.py b/narwhals/_arrow/selectors.py index 569724c45..d5a8ccae0 100644 --- a/narwhals/_arrow/selectors.py +++ b/narwhals/_arrow/selectors.py @@ -4,7 +4,6 @@ from typing import Any from typing import NoReturn -from narwhals import dtypes from narwhals._arrow.expr import ArrowExpr from narwhals.utils import Implementation @@ -14,12 +13,14 @@ from narwhals._arrow.dataframe import ArrowDataFrame from narwhals._arrow.series import ArrowSeries from narwhals.dtypes import DType + from narwhals.typing 
import DTypes class ArrowSelectorNamespace: - def __init__(self: Self, *, backend_version: tuple[int, ...]) -> None: + def __init__(self: Self, *, backend_version: tuple[int, ...], dtypes: DTypes) -> None: self._backend_version = backend_version self._implementation = Implementation.PYARROW + self._dtypes = dtypes def by_dtype(self: Self, dtypes: list[DType | type[DType]]) -> ArrowSelector: def func(df: ArrowDataFrame) -> list[ArrowSeries]: @@ -32,32 +33,33 @@ def func(df: ArrowDataFrame) -> list[ArrowSeries]: root_names=None, output_names=None, backend_version=self._backend_version, + dtypes=self._dtypes, ) def numeric(self: Self) -> ArrowSelector: return self.by_dtype( [ - dtypes.Int64, - dtypes.Int32, - dtypes.Int16, - dtypes.Int8, - dtypes.UInt64, - dtypes.UInt32, - dtypes.UInt16, - dtypes.UInt8, - dtypes.Float64, - dtypes.Float32, + self._dtypes.Int64, + self._dtypes.Int32, + self._dtypes.Int16, + self._dtypes.Int8, + self._dtypes.UInt64, + self._dtypes.UInt32, + self._dtypes.UInt16, + self._dtypes.UInt8, + self._dtypes.Float64, + self._dtypes.Float32, ], ) def categorical(self: Self) -> ArrowSelector: - return self.by_dtype([dtypes.Categorical]) + return self.by_dtype([self._dtypes.Categorical]) def string(self: Self) -> ArrowSelector: - return self.by_dtype([dtypes.String]) + return self.by_dtype([self._dtypes.String]) def boolean(self: Self) -> ArrowSelector: - return self.by_dtype([dtypes.Boolean]) + return self.by_dtype([self._dtypes.Boolean]) def all(self: Self) -> ArrowSelector: def func(df: ArrowDataFrame) -> list[ArrowSeries]: @@ -70,6 +72,7 @@ def func(df: ArrowDataFrame) -> list[ArrowSeries]: root_names=None, output_names=None, backend_version=self._backend_version, + dtypes=self._dtypes, ) @@ -91,6 +94,7 @@ def _to_expr(self: Self) -> ArrowExpr: root_names=self._root_names, output_names=self._output_names, backend_version=self._backend_version, + dtypes=self._dtypes, ) def __sub__(self: Self, other: Self | Any) -> ArrowSelector | Any: @@ -108,6 +112,7 @@ def call(df: ArrowDataFrame) -> list[ArrowSeries]: root_names=None, output_names=None, backend_version=self._backend_version, + dtypes=self._dtypes, ) else: return self._to_expr() - other @@ -127,6 +132,7 @@ def call(df: ArrowDataFrame) -> list[ArrowSeries]: root_names=None, output_names=None, backend_version=self._backend_version, + dtypes=self._dtypes, ) else: return self._to_expr() | other @@ -146,12 +152,18 @@ def call(df: ArrowDataFrame) -> list[ArrowSeries]: root_names=None, output_names=None, backend_version=self._backend_version, + dtypes=self._dtypes, ) else: return self._to_expr() & other def __invert__(self: Self) -> ArrowSelector: - return ArrowSelectorNamespace(backend_version=self._backend_version).all() - self + return ( + ArrowSelectorNamespace( + backend_version=self._backend_version, dtypes=self._dtypes + ).all() + - self + ) def __rsub__(self: Self, other: Any) -> NoReturn: raise NotImplementedError diff --git a/narwhals/_arrow/series.py b/narwhals/_arrow/series.py index a3c3e89e9..183cf37b7 100644 --- a/narwhals/_arrow/series.py +++ b/narwhals/_arrow/series.py @@ -25,6 +25,7 @@ from narwhals._arrow.dataframe import ArrowDataFrame from narwhals._arrow.namespace import ArrowNamespace from narwhals.dtypes import DType + from narwhals.typing import DTypes class ArrowSeries: @@ -34,11 +35,13 @@ def __init__( *, name: str, backend_version: tuple[int, ...], + dtypes: DTypes, ) -> None: self._name = name self._native_series = native_series self._implementation = Implementation.PYARROW self._backend_version = 
backend_version + self._dtypes = dtypes def _from_native_series(self, series: Any) -> Self: import pyarrow as pa # ignore-banned-import() @@ -49,6 +52,7 @@ def _from_native_series(self, series: Any) -> Self: series, name=self._name, backend_version=self._backend_version, + dtypes=self._dtypes, ) @classmethod @@ -58,6 +62,7 @@ def _from_iterable( name: str, *, backend_version: tuple[int, ...], + dtypes: DTypes, ) -> Self: import pyarrow as pa # ignore-banned-import() @@ -65,12 +70,13 @@ def _from_iterable( pa.chunked_array([data]), name=name, backend_version=backend_version, + dtypes=dtypes, ) def __narwhals_namespace__(self) -> ArrowNamespace: from narwhals._arrow.namespace import ArrowNamespace - return ArrowNamespace(backend_version=self._backend_version) + return ArrowNamespace(backend_version=self._backend_version, dtypes=self._dtypes) def __len__(self) -> int: return len(self._native_series) @@ -361,11 +367,12 @@ def alias(self, name: str) -> Self: self._native_series, name=name, backend_version=self._backend_version, + dtypes=self._dtypes, ) @property def dtype(self: Self) -> DType: - return native_to_narwhals_dtype(self._native_series.type) + return native_to_narwhals_dtype(self._native_series.type, self._dtypes) def abs(self) -> Self: import pyarrow.compute as pc # ignore-banned-import() @@ -438,7 +445,7 @@ def cast(self, dtype: DType) -> Self: import pyarrow.compute as pc # ignore-banned-import() ser = self._native_series - dtype = narwhals_to_native_dtype(dtype) + dtype = narwhals_to_native_dtype(dtype, self._dtypes) return self._from_native_series(pc.cast(ser, dtype)) def null_count(self: Self) -> int: @@ -474,7 +481,10 @@ def arg_true(self) -> Self: ser = self._native_series res = np.flatnonzero(ser) return self._from_iterable( - res, name=self.name, backend_version=self._backend_version + res, + name=self.name, + backend_version=self._backend_version, + dtypes=self._dtypes, ) def item(self: Self, index: int | None = None) -> Any: @@ -520,8 +530,7 @@ def value_counts( val_count = val_count.sort_by([(value_name_, "descending")]) return ArrowDataFrame( - val_count, - backend_version=self._backend_version, + val_count, backend_version=self._backend_version, dtypes=self._dtypes ) def zip_with(self: Self, mask: Self, other: Self) -> Self: @@ -574,7 +583,9 @@ def to_frame(self: Self) -> ArrowDataFrame: from narwhals._arrow.dataframe import ArrowDataFrame df = pa.Table.from_arrays([self._native_series], names=[self.name]) - return ArrowDataFrame(df, backend_version=self._backend_version) + return ArrowDataFrame( + df, backend_version=self._backend_version, dtypes=self._dtypes + ) def to_pandas(self: Self) -> Any: import pandas as pd # ignore-banned-import() @@ -670,6 +681,7 @@ def to_dummies( return ArrowDataFrame( pa.Table.from_arrays(columns, names=names), backend_version=self._backend_version, + dtypes=self._dtypes, ).select(*sorted(names)[int(drop_first) :]) def quantile( diff --git a/narwhals/_arrow/utils.py b/narwhals/_arrow/utils.py index 90195a386..d51a4b25d 100644 --- a/narwhals/_arrow/utils.py +++ b/narwhals/_arrow/utils.py @@ -4,16 +4,17 @@ from typing import Any from typing import Sequence -from narwhals import dtypes from narwhals.utils import isinstance_or_issubclass if TYPE_CHECKING: import pyarrow as pa from narwhals._arrow.series import ArrowSeries + from narwhals.dtypes import DType + from narwhals.typing import DTypes -def native_to_narwhals_dtype(dtype: Any) -> dtypes.DType: +def native_to_narwhals_dtype(dtype: Any, dtypes: DTypes) -> DType: import pyarrow as pa # 
ignore-banned-import if pa.types.is_int64(dtype): @@ -63,11 +64,9 @@ def native_to_narwhals_dtype(dtype: Any) -> dtypes.DType: return dtypes.Unknown() # pragma: no cover -def narwhals_to_native_dtype(dtype: dtypes.DType | type[dtypes.DType]) -> Any: +def narwhals_to_native_dtype(dtype: DType | type[DType], dtypes: DTypes) -> Any: import pyarrow as pa # ignore-banned-import - from narwhals import dtypes - if isinstance_or_issubclass(dtype, dtypes.Float64): return pa.float64() if isinstance_or_issubclass(dtype, dtypes.Float32): diff --git a/narwhals/_dask/dataframe.py b/narwhals/_dask/dataframe.py index 9538d6d89..916583eaa 100644 --- a/narwhals/_dask/dataframe.py +++ b/narwhals/_dask/dataframe.py @@ -26,15 +26,21 @@ from narwhals._dask.namespace import DaskNamespace from narwhals._dask.typing import IntoDaskExpr from narwhals.dtypes import DType + from narwhals.typing import DTypes class DaskLazyFrame: def __init__( - self, native_dataframe: dd.DataFrame, *, backend_version: tuple[int, ...] + self, + native_dataframe: dd.DataFrame, + *, + backend_version: tuple[int, ...], + dtypes: DTypes, ) -> None: self._native_frame = native_dataframe self._backend_version = backend_version self._implementation = Implementation.DASK + self._dtypes = dtypes def __native_namespace__(self: Self) -> ModuleType: if self._implementation is Implementation.DASK: @@ -46,13 +52,15 @@ def __native_namespace__(self: Self) -> ModuleType: def __narwhals_namespace__(self) -> DaskNamespace: from narwhals._dask.namespace import DaskNamespace - return DaskNamespace(backend_version=self._backend_version) + return DaskNamespace(backend_version=self._backend_version, dtypes=self._dtypes) def __narwhals_lazyframe__(self) -> Self: return self def _from_native_frame(self, df: Any) -> Self: - return self.__class__(df, backend_version=self._backend_version) + return self.__class__( + df, backend_version=self._backend_version, dtypes=self._dtypes + ) def with_columns(self, *exprs: DaskExpr, **named_exprs: DaskExpr) -> Self: df = self._native_frame @@ -70,6 +78,7 @@ def collect(self) -> Any: result, implementation=Implementation.PANDAS, backend_version=parse_version(pd.__version__), + dtypes=self._dtypes, ) @property @@ -92,7 +101,7 @@ def filter( from narwhals._dask.namespace import DaskNamespace - plx = DaskNamespace(backend_version=self._backend_version) + plx = DaskNamespace(backend_version=self._backend_version, dtypes=self._dtypes) expr = plx.all_horizontal(*predicates) # Safety: all_horizontal's expression only returns a single column. 
mask = expr._call(self)[0] @@ -140,7 +149,7 @@ def drop_nulls(self: Self, subset: str | list[str] | None) -> Self: @property def schema(self) -> dict[str, DType]: return { - col: native_to_narwhals_dtype(self._native_frame.loc[:, col]) + col: native_to_narwhals_dtype(self._native_frame.loc[:, col], self._dtypes) for col in self._native_frame.columns } diff --git a/narwhals/_dask/expr.py b/narwhals/_dask/expr.py index eda0fd589..d8d86692e 100644 --- a/narwhals/_dask/expr.py +++ b/narwhals/_dask/expr.py @@ -19,6 +19,7 @@ from narwhals._dask.dataframe import DaskLazyFrame from narwhals._dask.namespace import DaskNamespace from narwhals.dtypes import DType + from narwhals.typing import DTypes class DaskExpr: @@ -34,6 +35,7 @@ def __init__( # a reduction, such as `nw.col('a').sum()` returns_scalar: bool, backend_version: tuple[int, ...], + dtypes: DTypes, ) -> None: self._call = call self._depth = depth @@ -42,6 +44,7 @@ def __init__( self._output_names = output_names self._returns_scalar = returns_scalar self._backend_version = backend_version + self._dtypes = dtypes def __narwhals_expr__(self) -> None: ... @@ -49,13 +52,14 @@ def __narwhals_namespace__(self) -> DaskNamespace: # pragma: no cover # Unused, just for compatibility with PandasLikeExpr from narwhals._dask.namespace import DaskNamespace - return DaskNamespace(backend_version=self._backend_version) + return DaskNamespace(backend_version=self._backend_version, dtypes=self._dtypes) @classmethod def from_column_names( cls: type[Self], *column_names: str, backend_version: tuple[int, ...], + dtypes: DTypes, ) -> Self: def func(df: DaskLazyFrame) -> list[dask_expr.Series]: return [df._native_frame.loc[:, column_name] for column_name in column_names] @@ -68,6 +72,7 @@ def func(df: DaskLazyFrame) -> list[dask_expr.Series]: output_names=list(column_names), returns_scalar=False, backend_version=backend_version, + dtypes=dtypes, ) @classmethod @@ -75,6 +80,7 @@ def from_column_indices( cls: type[Self], *column_indices: int, backend_version: tuple[int, ...], + dtypes: DTypes, ) -> Self: def func(df: DaskLazyFrame) -> list[dask_expr.Series]: return [ @@ -89,6 +95,7 @@ def func(df: DaskLazyFrame) -> list[dask_expr.Series]: output_names=None, returns_scalar=False, backend_version=backend_version, + dtypes=dtypes, ) def _from_call( @@ -146,6 +153,7 @@ def func(df: DaskLazyFrame) -> list[dask_expr.Series]: output_names=output_names, returns_scalar=self._returns_scalar or returns_scalar, backend_version=self._backend_version, + dtypes=self._dtypes, ) def alias(self, name: str) -> Self: @@ -161,6 +169,7 @@ def func(df: DaskLazyFrame) -> list[dask_expr.Series]: output_names=[name], returns_scalar=self._returns_scalar, backend_version=self._backend_version, + dtypes=self._dtypes, ) def __add__(self, other: Any) -> Self: @@ -677,6 +686,7 @@ def func(df: DaskLazyFrame) -> list[Any]: output_names=self._output_names, returns_scalar=False, backend_version=self._backend_version, + dtypes=self._dtypes, ) def mode(self: Self) -> Self: @@ -700,7 +710,7 @@ def cast( dtype: DType | type[DType], ) -> Self: def func(_input: Any, dtype: DType | type[DType]) -> Any: - dtype = narwhals_to_native_dtype(dtype) + dtype = narwhals_to_native_dtype(dtype, self._dtypes) return _input.astype(dtype) return self._from_call( @@ -977,6 +987,7 @@ def keep(self: Self) -> DaskExpr: output_names=root_names, returns_scalar=self._expr._returns_scalar, backend_version=self._expr._backend_version, + dtypes=self._expr._dtypes, ) def map(self: Self, function: Callable[[str], str]) -> 
DaskExpr: @@ -1003,6 +1014,7 @@ def map(self: Self, function: Callable[[str], str]) -> DaskExpr: output_names=output_names, returns_scalar=self._expr._returns_scalar, backend_version=self._expr._backend_version, + dtypes=self._expr._dtypes, ) def prefix(self: Self, prefix: str) -> DaskExpr: @@ -1027,6 +1039,7 @@ def prefix(self: Self, prefix: str) -> DaskExpr: output_names=output_names, returns_scalar=self._expr._returns_scalar, backend_version=self._expr._backend_version, + dtypes=self._expr._dtypes, ) def suffix(self: Self, suffix: str) -> DaskExpr: @@ -1052,6 +1065,7 @@ def suffix(self: Self, suffix: str) -> DaskExpr: output_names=output_names, returns_scalar=self._expr._returns_scalar, backend_version=self._expr._backend_version, + dtypes=self._expr._dtypes, ) def to_lowercase(self: Self) -> DaskExpr: @@ -1077,6 +1091,7 @@ def to_lowercase(self: Self) -> DaskExpr: output_names=output_names, returns_scalar=self._expr._returns_scalar, backend_version=self._expr._backend_version, + dtypes=self._expr._dtypes, ) def to_uppercase(self: Self) -> DaskExpr: @@ -1102,4 +1117,5 @@ def to_uppercase(self: Self) -> DaskExpr: output_names=output_names, returns_scalar=self._expr._returns_scalar, backend_version=self._expr._backend_version, + dtypes=self._expr._dtypes, ) diff --git a/narwhals/_dask/group_by.py b/narwhals/_dask/group_by.py index d79c95d7b..55ef69f46 100644 --- a/narwhals/_dask/group_by.py +++ b/narwhals/_dask/group_by.py @@ -84,8 +84,7 @@ def _from_native_frame(self, df: DaskLazyFrame) -> DaskLazyFrame: from narwhals._dask.dataframe import DaskLazyFrame return DaskLazyFrame( - df, - backend_version=self._df._backend_version, + df, backend_version=self._df._backend_version, dtypes=self._df._dtypes ) diff --git a/narwhals/_dask/namespace.py b/narwhals/_dask/namespace.py index 39c647192..01d5bea48 100644 --- a/narwhals/_dask/namespace.py +++ b/narwhals/_dask/namespace.py @@ -9,7 +9,6 @@ from typing import NoReturn from typing import cast -from narwhals import dtypes from narwhals._dask.dataframe import DaskLazyFrame from narwhals._dask.expr import DaskExpr from narwhals._dask.selectors import DaskSelectorNamespace @@ -24,38 +23,19 @@ from narwhals._dask.typing import IntoDaskExpr from narwhals.dtypes import DType + from narwhals.typing import DTypes class DaskNamespace: - Int64 = dtypes.Int64 - Int32 = dtypes.Int32 - Int16 = dtypes.Int16 - Int8 = dtypes.Int8 - UInt64 = dtypes.UInt64 - UInt32 = dtypes.UInt32 - UInt16 = dtypes.UInt16 - UInt8 = dtypes.UInt8 - Float64 = dtypes.Float64 - Float32 = dtypes.Float32 - Boolean = dtypes.Boolean - Object = dtypes.Object - Unknown = dtypes.Unknown - Categorical = dtypes.Categorical - Enum = dtypes.Enum - String = dtypes.String - Datetime = dtypes.Datetime - Duration = dtypes.Duration - Date = dtypes.Date - List = dtypes.List - Struct = dtypes.Struct - Array = dtypes.Array - @property def selectors(self) -> DaskSelectorNamespace: - return DaskSelectorNamespace(backend_version=self._backend_version) + return DaskSelectorNamespace( + backend_version=self._backend_version, dtypes=self._dtypes + ) - def __init__(self, *, backend_version: tuple[int, ...]) -> None: + def __init__(self, *, backend_version: tuple[int, ...], dtypes: DTypes) -> None: self._backend_version = backend_version + self._dtypes = dtypes def all(self) -> DaskExpr: def func(df: DaskLazyFrame) -> list[dask_expr.Series]: @@ -69,25 +49,28 @@ def func(df: DaskLazyFrame) -> list[dask_expr.Series]: output_names=None, returns_scalar=False, backend_version=self._backend_version, + 
dtypes=self._dtypes, ) def col(self, *column_names: str) -> DaskExpr: return DaskExpr.from_column_names( - *column_names, - backend_version=self._backend_version, + *column_names, backend_version=self._backend_version, dtypes=self._dtypes ) def nth(self, *column_indices: int) -> DaskExpr: return DaskExpr.from_column_indices( - *column_indices, - backend_version=self._backend_version, + *column_indices, backend_version=self._backend_version, dtypes=self._dtypes ) - def lit(self, value: Any, dtype: dtypes.DType | None) -> DaskExpr: + def lit(self, value: Any, dtype: DType | None) -> DaskExpr: def convert_if_dtype( series: dask_expr.Series, dtype: DType | type[DType] ) -> dask_expr.Series: - return series.astype(narwhals_to_native_dtype(dtype)) if dtype else series + return ( + series.astype(narwhals_to_native_dtype(dtype, self._dtypes)) + if dtype + else series + ) return DaskExpr( lambda df: [ @@ -101,30 +84,27 @@ def convert_if_dtype( output_names=["lit"], returns_scalar=False, backend_version=self._backend_version, + dtypes=self._dtypes, ) def min(self, *column_names: str) -> DaskExpr: return DaskExpr.from_column_names( - *column_names, - backend_version=self._backend_version, + *column_names, backend_version=self._backend_version, dtypes=self._dtypes ).min() def max(self, *column_names: str) -> DaskExpr: return DaskExpr.from_column_names( - *column_names, - backend_version=self._backend_version, + *column_names, backend_version=self._backend_version, dtypes=self._dtypes ).max() def mean(self, *column_names: str) -> DaskExpr: return DaskExpr.from_column_names( - *column_names, - backend_version=self._backend_version, + *column_names, backend_version=self._backend_version, dtypes=self._dtypes ).mean() def sum(self, *column_names: str) -> DaskExpr: return DaskExpr.from_column_names( - *column_names, - backend_version=self._backend_version, + *column_names, backend_version=self._backend_version, dtypes=self._dtypes ).sum() def len(self) -> DaskExpr: @@ -150,6 +130,7 @@ def func(df: DaskLazyFrame) -> list[dask_expr.Series]: output_names=["len"], returns_scalar=True, backend_version=self._backend_version, + dtypes=self._dtypes, ) def all_horizontal(self, *exprs: IntoDaskExpr) -> DaskExpr: @@ -167,6 +148,7 @@ def func(df: DaskLazyFrame) -> list[dask_expr.Series]: output_names=reduce_output_names(parsed_exprs), returns_scalar=False, backend_version=self._backend_version, + dtypes=self._dtypes, ) def any_horizontal(self, *exprs: IntoDaskExpr) -> DaskExpr: @@ -184,6 +166,7 @@ def func(df: DaskLazyFrame) -> list[dask_expr.Series]: output_names=reduce_output_names(parsed_exprs), returns_scalar=False, backend_version=self._backend_version, + dtypes=self._dtypes, ) def sum_horizontal(self, *exprs: IntoDaskExpr) -> DaskExpr: @@ -201,6 +184,7 @@ def func(df: DaskLazyFrame) -> list[dask_expr.Series]: output_names=reduce_output_names(parsed_exprs), returns_scalar=False, backend_version=self._backend_version, + dtypes=self._dtypes, ) def concat( @@ -224,6 +208,7 @@ def concat( return DaskLazyFrame( dd.concat(native_frames, axis=0, join="inner"), backend_version=self._backend_version, + dtypes=self._dtypes, ) if how == "horizontal": all_column_names: list[str] = [ @@ -241,6 +226,7 @@ def concat( return DaskLazyFrame( dd.concat(native_frames, axis=1, join="outer"), backend_version=self._backend_version, + dtypes=self._dtypes, ) raise NotImplementedError @@ -282,14 +268,16 @@ def when( self, *predicates: IntoDaskExpr, ) -> DaskWhen: - plx = self.__class__(backend_version=self._backend_version) + plx = 
self.__class__(backend_version=self._backend_version, dtypes=self._dtypes) if predicates: condition = plx.all_horizontal(*predicates) else: msg = "at least one predicate needs to be provided" raise TypeError(msg) - return DaskWhen(condition, self._backend_version, returns_scalar=False) + return DaskWhen( + condition, self._backend_version, returns_scalar=False, dtypes=self._dtypes + ) class DaskWhen: @@ -301,18 +289,20 @@ def __init__( otherwise_value: Any = None, *, returns_scalar: bool, + dtypes: DTypes, ) -> None: self._backend_version = backend_version self._condition = condition self._then_value = then_value self._otherwise_value = otherwise_value self._returns_scalar = returns_scalar + self._dtypes = dtypes def __call__(self, df: DaskLazyFrame) -> list[dask_expr.Series]: from narwhals._dask.namespace import DaskNamespace from narwhals._expression_parsing import parse_into_expr - plx = DaskNamespace(backend_version=self._backend_version) + plx = DaskNamespace(backend_version=self._backend_version, dtypes=self._dtypes) condition = parse_into_expr(self._condition, namespace=plx)._call(df)[0] # type: ignore[arg-type] condition = cast("dask_expr.Series", condition) @@ -349,6 +339,7 @@ def then(self, value: DaskExpr | Any) -> DaskThen: output_names=None, returns_scalar=self._returns_scalar, backend_version=self._backend_version, + dtypes=self._dtypes, ) @@ -363,9 +354,10 @@ def __init__( output_names: list[str] | None, returns_scalar: bool, backend_version: tuple[int, ...], + dtypes: DTypes, ) -> None: self._backend_version = backend_version - + self._dtypes = dtypes self._call = call self._depth = depth self._function_name = function_name diff --git a/narwhals/_dask/selectors.py b/narwhals/_dask/selectors.py index 54131a8a5..4d9af1110 100644 --- a/narwhals/_dask/selectors.py +++ b/narwhals/_dask/selectors.py @@ -4,7 +4,6 @@ from typing import Any from typing import NoReturn -from narwhals import dtypes from narwhals._dask.expr import DaskExpr if TYPE_CHECKING: @@ -13,11 +12,13 @@ from narwhals._dask.dataframe import DaskLazyFrame from narwhals.dtypes import DType + from narwhals.typing import DTypes class DaskSelectorNamespace: - def __init__(self: Self, *, backend_version: tuple[int, ...]) -> None: + def __init__(self: Self, *, backend_version: tuple[int, ...], dtypes: DTypes) -> None: self._backend_version = backend_version + self._dtypes = dtypes def by_dtype(self: Self, dtypes: list[DType | type[DType]]) -> DaskSelector: def func(df: DaskLazyFrame) -> list[Any]: @@ -33,32 +34,33 @@ def func(df: DaskLazyFrame) -> list[Any]: output_names=None, backend_version=self._backend_version, returns_scalar=False, + dtypes=self._dtypes, ) def numeric(self: Self) -> DaskSelector: return self.by_dtype( [ - dtypes.Int64, - dtypes.Int32, - dtypes.Int16, - dtypes.Int8, - dtypes.UInt64, - dtypes.UInt32, - dtypes.UInt16, - dtypes.UInt8, - dtypes.Float64, - dtypes.Float32, + self._dtypes.Int64, + self._dtypes.Int32, + self._dtypes.Int16, + self._dtypes.Int8, + self._dtypes.UInt64, + self._dtypes.UInt32, + self._dtypes.UInt16, + self._dtypes.UInt8, + self._dtypes.Float64, + self._dtypes.Float32, ], ) def categorical(self: Self) -> DaskSelector: - return self.by_dtype([dtypes.Categorical]) + return self.by_dtype([self._dtypes.Categorical]) def string(self: Self) -> DaskSelector: - return self.by_dtype([dtypes.String]) + return self.by_dtype([self._dtypes.String]) def boolean(self: Self) -> DaskSelector: - return self.by_dtype([dtypes.Boolean]) + return self.by_dtype([self._dtypes.Boolean]) def all(self: 
Self) -> DaskSelector: def func(df: DaskLazyFrame) -> list[Any]: @@ -72,6 +74,7 @@ def func(df: DaskLazyFrame) -> list[Any]: output_names=None, backend_version=self._backend_version, returns_scalar=False, + dtypes=self._dtypes, ) @@ -94,6 +97,7 @@ def _to_expr(self: Self) -> DaskExpr: output_names=self._output_names, backend_version=self._backend_version, returns_scalar=self._returns_scalar, + dtypes=self._dtypes, ) def __sub__(self: Self, other: DaskSelector | Any) -> DaskSelector | Any: @@ -112,6 +116,7 @@ def call(df: DaskLazyFrame) -> list[Any]: output_names=None, backend_version=self._backend_version, returns_scalar=self._returns_scalar, + dtypes=self._dtypes, ) else: return self._to_expr() - other @@ -132,6 +137,7 @@ def call(df: DaskLazyFrame) -> list[dask_expr.Series]: output_names=None, backend_version=self._backend_version, returns_scalar=self._returns_scalar, + dtypes=self._dtypes, ) else: return self._to_expr() | other @@ -152,12 +158,18 @@ def call(df: DaskLazyFrame) -> list[Any]: output_names=None, backend_version=self._backend_version, returns_scalar=self._returns_scalar, + dtypes=self._dtypes, ) else: return self._to_expr() & other def __invert__(self: Self) -> DaskSelector: - return DaskSelectorNamespace(backend_version=self._backend_version).all() - self + return ( + DaskSelectorNamespace( + backend_version=self._backend_version, dtypes=self._dtypes + ).all() + - self + ) def __rsub__(self: Self, other: Any) -> NoReturn: raise NotImplementedError diff --git a/narwhals/_dask/utils.py b/narwhals/_dask/utils.py index 02dedab4e..f7636bd5f 100644 --- a/narwhals/_dask/utils.py +++ b/narwhals/_dask/utils.py @@ -14,6 +14,7 @@ from narwhals._dask.dataframe import DaskLazyFrame from narwhals.dtypes import DType + from narwhals.typing import DTypes def maybe_evaluate(df: DaskLazyFrame, obj: Any) -> Any: @@ -83,9 +84,7 @@ def validate_comparand(lhs: dask_expr.Series, rhs: dask_expr.Series) -> None: raise RuntimeError(msg) -def narwhals_to_native_dtype(dtype: DType | type[DType]) -> Any: - from narwhals import dtypes - +def narwhals_to_native_dtype(dtype: DType | type[DType], dtypes: DTypes) -> Any: if isinstance_or_issubclass(dtype, dtypes.Float64): return "float64" if isinstance_or_issubclass(dtype, dtypes.Float32): diff --git a/narwhals/_duckdb/dataframe.py b/narwhals/_duckdb/dataframe.py index 099a91b72..5e8cb73d3 100644 --- a/narwhals/_duckdb/dataframe.py +++ b/narwhals/_duckdb/dataframe.py @@ -4,7 +4,6 @@ from typing import TYPE_CHECKING from typing import Any -from narwhals import dtypes from narwhals.utils import parse_version if TYPE_CHECKING: @@ -13,11 +12,11 @@ from typing_extensions import Self from narwhals._duckdb.series import DuckDBInterchangeSeries + from narwhals.dtypes import DType + from narwhals.typing import DTypes -def map_duckdb_dtype_to_narwhals_dtype( - duckdb_dtype: Any, -) -> dtypes.DType: +def map_duckdb_dtype_to_narwhals_dtype(duckdb_dtype: Any, dtypes: DTypes) -> DType: duckdb_dtype = str(duckdb_dtype) if duckdb_dtype == "BIGINT": return dtypes.Int64() @@ -59,8 +58,9 @@ def map_duckdb_dtype_to_narwhals_dtype( class DuckDBInterchangeFrame: - def __init__(self, df: Any) -> None: + def __init__(self, df: Any, dtypes: DTypes) -> None: self._native_frame = df + self._dtypes = dtypes def __narwhals_dataframe__(self) -> Any: return self @@ -68,12 +68,16 @@ def __narwhals_dataframe__(self) -> Any: def __getitem__(self, item: str) -> DuckDBInterchangeSeries: from narwhals._duckdb.series import DuckDBInterchangeSeries - return 
DuckDBInterchangeSeries(self._native_frame.select(item)) + return DuckDBInterchangeSeries( + self._native_frame.select(item), dtypes=self._dtypes + ) def __getattr__(self, attr: str) -> Any: if attr == "schema": return { - column_name: map_duckdb_dtype_to_narwhals_dtype(duckdb_dtype) + column_name: map_duckdb_dtype_to_narwhals_dtype( + duckdb_dtype, self._dtypes + ) for column_name, duckdb_dtype in zip( self._native_frame.columns, self._native_frame.types ) diff --git a/narwhals/_duckdb/series.py b/narwhals/_duckdb/series.py index f19a6f76f..a7dbdd549 100644 --- a/narwhals/_duckdb/series.py +++ b/narwhals/_duckdb/series.py @@ -1,20 +1,27 @@ from __future__ import annotations +from typing import TYPE_CHECKING from typing import Any from narwhals._duckdb.dataframe import map_duckdb_dtype_to_narwhals_dtype +if TYPE_CHECKING: + from narwhals.typing import DTypes + class DuckDBInterchangeSeries: - def __init__(self, df: Any) -> None: + def __init__(self, df: Any, dtypes: DTypes) -> None: self._native_series = df + self._dtypes = dtypes def __narwhals_series__(self) -> Any: return self def __getattr__(self, attr: str) -> Any: if attr == "dtype": - return map_duckdb_dtype_to_narwhals_dtype(self._native_series.types[0]) + return map_duckdb_dtype_to_narwhals_dtype( + self._native_series.types[0], self._dtypes + ) msg = ( # pragma: no cover f"Attribute {attr} is not supported for metadata-only dataframes.\n\n" "If you would like to see this kind of object better supported in " diff --git a/narwhals/_ibis/dataframe.py b/narwhals/_ibis/dataframe.py index f0dc8f6eb..6f53e277d 100644 --- a/narwhals/_ibis/dataframe.py +++ b/narwhals/_ibis/dataframe.py @@ -3,19 +3,17 @@ from typing import TYPE_CHECKING from typing import Any -from narwhals import dtypes - if TYPE_CHECKING: import pandas as pd import pyarrow as pa from typing_extensions import Self from narwhals._ibis.series import IbisInterchangeSeries + from narwhals.dtypes import DType + from narwhals.typing import DTypes -def map_ibis_dtype_to_narwhals_dtype( - ibis_dtype: Any, -) -> dtypes.DType: +def map_ibis_dtype_to_narwhals_dtype(ibis_dtype: Any, dtypes: DTypes) -> DType: if ibis_dtype.is_int64(): return dtypes.Int64() if ibis_dtype.is_int32(): @@ -52,8 +50,9 @@ def map_ibis_dtype_to_narwhals_dtype( class IbisInterchangeFrame: - def __init__(self, df: Any) -> None: + def __init__(self, df: Any, dtypes: DTypes) -> None: self._native_frame = df + self._dtypes = dtypes def __narwhals_dataframe__(self) -> Any: return self @@ -61,7 +60,7 @@ def __narwhals_dataframe__(self) -> Any: def __getitem__(self, item: str) -> IbisInterchangeSeries: from narwhals._ibis.series import IbisInterchangeSeries - return IbisInterchangeSeries(self._native_frame[item]) + return IbisInterchangeSeries(self._native_frame[item], dtypes=self._dtypes) def to_pandas(self: Self) -> pd.DataFrame: return self._native_frame.to_pandas() @@ -72,7 +71,7 @@ def to_arrow(self: Self) -> pa.Table: def __getattr__(self, attr: str) -> Any: if attr == "schema": return { - column_name: map_ibis_dtype_to_narwhals_dtype(ibis_dtype) + column_name: map_ibis_dtype_to_narwhals_dtype(ibis_dtype, self._dtypes) for column_name, ibis_dtype in self._native_frame.schema().items() } msg = ( diff --git a/narwhals/_ibis/series.py b/narwhals/_ibis/series.py index 73e3b6d47..2f6cd6faa 100644 --- a/narwhals/_ibis/series.py +++ b/narwhals/_ibis/series.py @@ -1,20 +1,27 @@ from __future__ import annotations +from typing import TYPE_CHECKING from typing import Any from narwhals._ibis.dataframe import 
map_ibis_dtype_to_narwhals_dtype +if TYPE_CHECKING: + from narwhals.typing import DTypes + class IbisInterchangeSeries: - def __init__(self, df: Any) -> None: + def __init__(self, df: Any, dtypes: DTypes) -> None: self._native_series = df + self._dtypes = dtypes def __narwhals_series__(self) -> Any: return self def __getattr__(self, attr: str) -> Any: if attr == "dtype": - return map_ibis_dtype_to_narwhals_dtype(self._native_series.type()) + return map_ibis_dtype_to_narwhals_dtype( + self._native_series.type(), self._dtypes + ) msg = ( f"Attribute {attr} is not supported for metadata-only dataframes.\n\n" "If you would like to see this kind of object better supported in " diff --git a/narwhals/_interchange/dataframe.py b/narwhals/_interchange/dataframe.py index 975da216f..1dc671dc7 100644 --- a/narwhals/_interchange/dataframe.py +++ b/narwhals/_interchange/dataframe.py @@ -5,7 +5,6 @@ from typing import Any from typing import NoReturn -from narwhals import dtypes from narwhals.utils import parse_version if TYPE_CHECKING: @@ -14,6 +13,8 @@ from typing_extensions import Self from narwhals._interchange.series import InterchangeSeries + from narwhals.dtypes import DType + from narwhals.typing import DTypes class DtypeKind(enum.IntEnum): @@ -28,8 +29,8 @@ class DtypeKind(enum.IntEnum): def map_interchange_dtype_to_narwhals_dtype( - interchange_dtype: tuple[DtypeKind, int, Any, Any], -) -> dtypes.DType: + interchange_dtype: tuple[DtypeKind, int, Any, Any], dtypes: DTypes +) -> DType: if interchange_dtype[0] == DtypeKind.INT: if interchange_dtype[1] == 64: return dtypes.Int64() @@ -73,9 +74,10 @@ def map_interchange_dtype_to_narwhals_dtype( class InterchangeFrame: - def __init__(self, df: Any) -> None: + def __init__(self, df: Any, dtypes: DTypes) -> None: self._native_frame = df self._interchange_frame = df.__dataframe__() + self._dtypes = dtypes def __narwhals_dataframe__(self) -> Any: return self @@ -83,13 +85,16 @@ def __narwhals_dataframe__(self) -> Any: def __getitem__(self, item: str) -> InterchangeSeries: from narwhals._interchange.series import InterchangeSeries - return InterchangeSeries(self._interchange_frame.get_column_by_name(item)) + return InterchangeSeries( + self._interchange_frame.get_column_by_name(item), dtypes=self._dtypes + ) @property - def schema(self) -> dict[str, dtypes.DType]: + def schema(self) -> dict[str, DType]: return { column_name: map_interchange_dtype_to_narwhals_dtype( - self._interchange_frame.get_column_by_name(column_name).dtype + self._interchange_frame.get_column_by_name(column_name).dtype, + self._dtypes, ) for column_name in self._interchange_frame.column_names() } diff --git a/narwhals/_interchange/series.py b/narwhals/_interchange/series.py index 70f84d12f..00426e6c0 100644 --- a/narwhals/_interchange/series.py +++ b/narwhals/_interchange/series.py @@ -2,27 +2,27 @@ from typing import TYPE_CHECKING from typing import Any -from typing import NoReturn from narwhals._interchange.dataframe import map_interchange_dtype_to_narwhals_dtype if TYPE_CHECKING: - from narwhals.dtypes import DType + from narwhals.typing import DTypes class InterchangeSeries: - def __init__(self, df: Any) -> None: + def __init__(self, df: Any, dtypes: DTypes) -> None: self._native_series = df + self._dtypes = dtypes def __narwhals_series__(self) -> Any: return self - @property - def dtype(self) -> DType: - return map_interchange_dtype_to_narwhals_dtype(self._native_series.dtype) - - def __getattr__(self, attr: str) -> NoReturn: - msg = ( + def __getattr__(self, attr: str) -> Any: + 
if attr == "dtype": + return map_interchange_dtype_to_narwhals_dtype( + self._native_series.dtype, dtypes=self._dtypes + ) + msg = ( # pragma: no cover f"Attribute {attr} is not supported for metadata-only dataframes.\n\n" "Hint: you probably called `nw.from_native` on an object which isn't fully " "supported by Narwhals, yet implements `__dataframe__`. If you would like to " diff --git a/narwhals/_pandas_like/dataframe.py b/narwhals/_pandas_like/dataframe.py index 47a6bb39b..aae86cef7 100644 --- a/narwhals/_pandas_like/dataframe.py +++ b/narwhals/_pandas_like/dataframe.py @@ -35,6 +35,7 @@ from narwhals._pandas_like.series import PandasLikeSeries from narwhals._pandas_like.typing import IntoPandasLikeExpr from narwhals.dtypes import DType + from narwhals.typing import DTypes class PandasLikeDataFrame: @@ -45,11 +46,13 @@ def __init__( *, implementation: Implementation, backend_version: tuple[int, ...], + dtypes: DTypes, ) -> None: self._validate_columns(native_dataframe.columns) self._native_frame = native_dataframe self._implementation = implementation self._backend_version = backend_version + self._dtypes = dtypes def __narwhals_dataframe__(self) -> Self: return self @@ -60,7 +63,9 @@ def __narwhals_lazyframe__(self) -> Self: def __narwhals_namespace__(self) -> PandasLikeNamespace: from narwhals._pandas_like.namespace import PandasLikeNamespace - return PandasLikeNamespace(self._implementation, self._backend_version) + return PandasLikeNamespace( + self._implementation, self._backend_version, dtypes=self._dtypes + ) def __native_namespace__(self: Self) -> ModuleType: if self._implementation in { @@ -92,6 +97,7 @@ def _from_native_frame(self, df: Any) -> Self: df, implementation=self._implementation, backend_version=self._backend_version, + dtypes=self._dtypes, ) def get_column(self, name: str) -> PandasLikeSeries: @@ -101,6 +107,7 @@ def get_column(self, name: str) -> PandasLikeSeries: self._native_frame.loc[:, name], implementation=self._implementation, backend_version=self._backend_version, + dtypes=self._dtypes, ) def __array__(self, dtype: Any = None, copy: bool | None = None) -> np.ndarray: @@ -153,6 +160,7 @@ def __getitem__( self._native_frame.loc[:, item], implementation=self._implementation, backend_version=self._backend_version, + dtypes=self._dtypes, ) elif ( @@ -208,6 +216,7 @@ def __getitem__( native_series, implementation=self._implementation, backend_version=self._backend_version, + dtypes=self._dtypes, ) elif is_sequence_but_not_str(item) or (is_numpy_array(item) and item.ndim == 1): @@ -265,7 +274,7 @@ def iter_rows( @property def schema(self) -> dict[str, DType]: return { - col: native_to_narwhals_dtype(self._native_frame.loc[:, col]) + col: native_to_narwhals_dtype(self._native_frame.loc[:, col], self._dtypes) for col in self._native_frame.columns } @@ -306,6 +315,7 @@ def with_row_index(self, name: str) -> Self: index=self._native_frame.index, implementation=self._implementation, backend_version=self._backend_version, + dtypes=self._dtypes, ).alias(name) return self._from_native_frame( horizontal_concat( @@ -417,6 +427,7 @@ def collect(self) -> PandasLikeDataFrame: self._native_frame, implementation=self._implementation, backend_version=self._backend_version, + dtypes=self._dtypes, ) # --- actions --- @@ -623,6 +634,7 @@ def to_dict(self, *, as_series: bool = False) -> dict[str, Any]: self._native_frame.loc[:, col], implementation=self._implementation, backend_version=self._backend_version, + dtypes=self._dtypes, ) for col in self.columns } @@ -672,6 +684,7 @@ def 
is_duplicated(self: Self) -> PandasLikeSeries: self._native_frame.duplicated(keep=False), implementation=self._implementation, backend_version=self._backend_version, + dtypes=self._dtypes, ) def is_empty(self: Self) -> bool: @@ -684,6 +697,7 @@ def is_unique(self: Self) -> PandasLikeSeries: ~self._native_frame.duplicated(keep=False), implementation=self._implementation, backend_version=self._backend_version, + dtypes=self._dtypes, ) def null_count(self: Self) -> PandasLikeDataFrame: @@ -691,6 +705,7 @@ def null_count(self: Self) -> PandasLikeDataFrame: self._native_frame.isna().sum(axis=0).to_frame().transpose(), implementation=self._implementation, backend_version=self._backend_version, + dtypes=self._dtypes, ) def item(self: Self, row: int | None = None, column: int | str | None = None) -> Any: diff --git a/narwhals/_pandas_like/expr.py b/narwhals/_pandas_like/expr.py index 512699515..52c237aaa 100644 --- a/narwhals/_pandas_like/expr.py +++ b/narwhals/_pandas_like/expr.py @@ -14,6 +14,7 @@ from narwhals._pandas_like.dataframe import PandasLikeDataFrame from narwhals._pandas_like.namespace import PandasLikeNamespace + from narwhals.typing import DTypes from narwhals.utils import Implementation @@ -28,6 +29,7 @@ def __init__( output_names: list[str] | None, implementation: Implementation, backend_version: tuple[int, ...], + dtypes: DTypes, ) -> None: self._call = call self._depth = depth @@ -37,6 +39,7 @@ def __init__( self._output_names = output_names self._implementation = implementation self._backend_version = backend_version + self._dtypes = dtypes def __repr__(self) -> str: # pragma: no cover return ( @@ -50,7 +53,9 @@ def __repr__(self) -> str: # pragma: no cover def __narwhals_namespace__(self) -> PandasLikeNamespace: from narwhals._pandas_like.namespace import PandasLikeNamespace - return PandasLikeNamespace(self._implementation, self._backend_version) + return PandasLikeNamespace( + self._implementation, self._backend_version, dtypes=self._dtypes + ) def __narwhals_expr__(self) -> None: ... 
@@ -60,6 +65,7 @@ def from_column_names( *column_names: str, implementation: Implementation, backend_version: tuple[int, ...], + dtypes: DTypes, ) -> Self: def func(df: PandasLikeDataFrame) -> list[PandasLikeSeries]: return [ @@ -67,6 +73,7 @@ def func(df: PandasLikeDataFrame) -> list[PandasLikeSeries]: df._native_frame.loc[:, column_name], implementation=df._implementation, backend_version=df._backend_version, + dtypes=df._dtypes, ) for column_name in column_names ] @@ -79,6 +86,7 @@ def func(df: PandasLikeDataFrame) -> list[PandasLikeSeries]: output_names=list(column_names), implementation=implementation, backend_version=backend_version, + dtypes=dtypes, ) @classmethod @@ -87,6 +95,7 @@ def from_column_indices( *column_indices: int, implementation: Implementation, backend_version: tuple[int, ...], + dtypes: DTypes, ) -> Self: def func(df: PandasLikeDataFrame) -> list[PandasLikeSeries]: return [ @@ -94,6 +103,7 @@ def func(df: PandasLikeDataFrame) -> list[PandasLikeSeries]: df._native_frame.iloc[:, column_index], implementation=df._implementation, backend_version=df._backend_version, + dtypes=df._dtypes, ) for column_index in column_indices ] @@ -106,6 +116,7 @@ def func(df: PandasLikeDataFrame) -> list[PandasLikeSeries]: output_names=None, implementation=implementation, backend_version=backend_version, + dtypes=dtypes, ) def cast( @@ -308,6 +319,7 @@ def alias(self, name: str) -> Self: output_names=[name], implementation=self._implementation, backend_version=self._backend_version, + dtypes=self._dtypes, ) def over(self, keys: list[str]) -> Self: @@ -333,6 +345,7 @@ def func(df: PandasLikeDataFrame) -> list[PandasLikeSeries]: output_names=self._output_names, implementation=self._implementation, backend_version=self._backend_version, + dtypes=self._dtypes, ) def is_duplicated(self) -> Self: @@ -586,6 +599,7 @@ def keep(self: Self) -> PandasLikeExpr: output_names=root_names, implementation=self._expr._implementation, backend_version=self._expr._backend_version, + dtypes=self._expr._dtypes, ) def map(self: Self, function: Callable[[str], str]) -> PandasLikeExpr: @@ -612,6 +626,7 @@ def map(self: Self, function: Callable[[str], str]) -> PandasLikeExpr: output_names=output_names, implementation=self._expr._implementation, backend_version=self._expr._backend_version, + dtypes=self._expr._dtypes, ) def prefix(self: Self, prefix: str) -> PandasLikeExpr: @@ -636,6 +651,7 @@ def prefix(self: Self, prefix: str) -> PandasLikeExpr: output_names=output_names, implementation=self._expr._implementation, backend_version=self._expr._backend_version, + dtypes=self._expr._dtypes, ) def suffix(self: Self, suffix: str) -> PandasLikeExpr: @@ -661,6 +677,7 @@ def suffix(self: Self, suffix: str) -> PandasLikeExpr: output_names=output_names, implementation=self._expr._implementation, backend_version=self._expr._backend_version, + dtypes=self._expr._dtypes, ) def to_lowercase(self: Self) -> PandasLikeExpr: @@ -686,6 +703,7 @@ def to_lowercase(self: Self) -> PandasLikeExpr: output_names=output_names, implementation=self._expr._implementation, backend_version=self._expr._backend_version, + dtypes=self._expr._dtypes, ) def to_uppercase(self: Self) -> PandasLikeExpr: @@ -711,4 +729,5 @@ def to_uppercase(self: Self) -> PandasLikeExpr: output_names=output_names, implementation=self._expr._implementation, backend_version=self._expr._backend_version, + dtypes=self._expr._dtypes, ) diff --git a/narwhals/_pandas_like/group_by.py b/narwhals/_pandas_like/group_by.py index 55c038f9d..f20383460 100644 --- 
a/narwhals/_pandas_like/group_by.py +++ b/narwhals/_pandas_like/group_by.py @@ -92,6 +92,7 @@ def _from_native_frame(self, df: PandasLikeDataFrame) -> PandasLikeDataFrame: df, implementation=self._df._implementation, backend_version=self._df._backend_version, + dtypes=self._df._dtypes, ) def __iter__(self) -> Iterator[tuple[Any, PandasLikeDataFrame]]: diff --git a/narwhals/_pandas_like/namespace.py b/narwhals/_pandas_like/namespace.py index 7356524d3..6aacf2856 100644 --- a/narwhals/_pandas_like/namespace.py +++ b/narwhals/_pandas_like/namespace.py @@ -8,7 +8,6 @@ from typing import Literal from typing import cast -from narwhals import dtypes from narwhals._expression_parsing import combine_root_names from narwhals._expression_parsing import parse_into_exprs from narwhals._expression_parsing import reduce_output_names @@ -22,45 +21,30 @@ if TYPE_CHECKING: from narwhals._pandas_like.typing import IntoPandasLikeExpr + from narwhals.dtypes import DType + from narwhals.typing import DTypes from narwhals.utils import Implementation class PandasLikeNamespace: - Int64 = dtypes.Int64 - Int32 = dtypes.Int32 - Int16 = dtypes.Int16 - Int8 = dtypes.Int8 - UInt64 = dtypes.UInt64 - UInt32 = dtypes.UInt32 - UInt16 = dtypes.UInt16 - UInt8 = dtypes.UInt8 - Float64 = dtypes.Float64 - Float32 = dtypes.Float32 - Boolean = dtypes.Boolean - Object = dtypes.Object - Unknown = dtypes.Unknown - Categorical = dtypes.Categorical - Enum = dtypes.Enum - String = dtypes.String - Datetime = dtypes.Datetime - Duration = dtypes.Duration - Date = dtypes.Date - List = dtypes.List - Struct = dtypes.Struct - Array = dtypes.Array - @property def selectors(self) -> PandasSelectorNamespace: return PandasSelectorNamespace( - implementation=self._implementation, backend_version=self._backend_version + implementation=self._implementation, + backend_version=self._backend_version, + dtypes=self._dtypes, ) # --- not in spec --- def __init__( - self, implementation: Implementation, backend_version: tuple[int, ...] 
+ self, + implementation: Implementation, + backend_version: tuple[int, ...], + dtypes: DTypes, ) -> None: self._implementation = implementation self._backend_version = backend_version + self._dtypes = dtypes def _create_expr_from_callable( self, @@ -79,6 +63,7 @@ def _create_expr_from_callable( output_names=output_names, implementation=self._implementation, backend_version=self._backend_version, + dtypes=self._dtypes, ) def _create_series_from_scalar( @@ -90,6 +75,7 @@ def _create_series_from_scalar( index=series._native_series.index[0:1], implementation=self._implementation, backend_version=self._backend_version, + dtypes=self._dtypes, ) def _create_expr_from_series(self, series: PandasLikeSeries) -> PandasLikeExpr: @@ -101,6 +87,7 @@ def _create_expr_from_series(self, series: PandasLikeSeries) -> PandasLikeExpr: output_names=None, implementation=self._implementation, backend_version=self._backend_version, + dtypes=self._dtypes, ) def _create_compliant_series(self, value: Any) -> PandasLikeSeries: @@ -108,6 +95,7 @@ def _create_compliant_series(self, value: Any) -> PandasLikeSeries: value, implementation=self._implementation, backend_version=self._backend_version, + dtypes=self._dtypes, ) # --- selection --- @@ -116,6 +104,7 @@ def col(self, *column_names: str) -> PandasLikeExpr: *column_names, implementation=self._implementation, backend_version=self._backend_version, + dtypes=self._dtypes, ) def nth(self, *column_indices: int) -> PandasLikeExpr: @@ -123,6 +112,7 @@ def nth(self, *column_indices: int) -> PandasLikeExpr: *column_indices, implementation=self._implementation, backend_version=self._backend_version, + dtypes=self._dtypes, ) def all(self) -> PandasLikeExpr: @@ -132,6 +122,7 @@ def all(self) -> PandasLikeExpr: df._native_frame.loc[:, column_name], implementation=self._implementation, backend_version=self._backend_version, + dtypes=self._dtypes, ) for column_name in df.columns ], @@ -141,9 +132,10 @@ def all(self) -> PandasLikeExpr: output_names=None, implementation=self._implementation, backend_version=self._backend_version, + dtypes=self._dtypes, ) - def lit(self, value: Any, dtype: dtypes.DType | None) -> PandasLikeExpr: + def lit(self, value: Any, dtype: DType | None) -> PandasLikeExpr: def _lit_pandas_series(df: PandasLikeDataFrame) -> PandasLikeSeries: pandas_series = PandasLikeSeries._from_iterable( data=[value], @@ -151,6 +143,7 @@ def _lit_pandas_series(df: PandasLikeDataFrame) -> PandasLikeSeries: index=df._native_frame.index[0:1], implementation=self._implementation, backend_version=self._backend_version, + dtypes=self._dtypes, ) if dtype: return pandas_series.cast(dtype) @@ -164,6 +157,7 @@ def _lit_pandas_series(df: PandasLikeDataFrame) -> PandasLikeSeries: output_names=["lit"], implementation=self._implementation, backend_version=self._backend_version, + dtypes=self._dtypes, ) # --- reduction --- @@ -172,6 +166,7 @@ def sum(self, *column_names: str) -> PandasLikeExpr: *column_names, implementation=self._implementation, backend_version=self._backend_version, + dtypes=self._dtypes, ).sum() def mean(self, *column_names: str) -> PandasLikeExpr: @@ -179,6 +174,7 @@ def mean(self, *column_names: str) -> PandasLikeExpr: *column_names, implementation=self._implementation, backend_version=self._backend_version, + dtypes=self._dtypes, ).mean() def max(self, *column_names: str) -> PandasLikeExpr: @@ -186,6 +182,7 @@ def max(self, *column_names: str) -> PandasLikeExpr: *column_names, implementation=self._implementation, backend_version=self._backend_version, + 
dtypes=self._dtypes, ).max() def min(self, *column_names: str) -> PandasLikeExpr: @@ -193,6 +190,7 @@ def min(self, *column_names: str) -> PandasLikeExpr: *column_names, implementation=self._implementation, backend_version=self._backend_version, + dtypes=self._dtypes, ).min() def len(self) -> PandasLikeExpr: @@ -204,6 +202,7 @@ def len(self) -> PandasLikeExpr: index=[0], implementation=self._implementation, backend_version=self._backend_version, + dtypes=self._dtypes, ) ], depth=0, @@ -212,6 +211,7 @@ def len(self) -> PandasLikeExpr: output_names=["len"], implementation=self._implementation, backend_version=self._backend_version, + dtypes=self._dtypes, ) # --- horizontal --- @@ -284,6 +284,7 @@ def concat( ), implementation=self._implementation, backend_version=self._backend_version, + dtypes=self._dtypes, ) if how == "vertical": return PandasLikeDataFrame( @@ -294,6 +295,7 @@ def concat( ), implementation=self._implementation, backend_version=self._backend_version, + dtypes=self._dtypes, ) raise NotImplementedError @@ -301,14 +303,18 @@ def when( self, *predicates: IntoPandasLikeExpr, ) -> PandasWhen: - plx = self.__class__(self._implementation, self._backend_version) + plx = self.__class__( + self._implementation, self._backend_version, dtypes=self._dtypes + ) if predicates: condition = plx.all_horizontal(*predicates) else: msg = "at least one predicate needs to be provided" raise TypeError(msg) - return PandasWhen(condition, self._implementation, self._backend_version) + return PandasWhen( + condition, self._implementation, self._backend_version, dtypes=self._dtypes + ) class PandasWhen: @@ -319,12 +325,15 @@ def __init__( backend_version: tuple[int, ...], then_value: Any = None, otherwise_value: Any = None, + *, + dtypes: DTypes, ) -> None: self._implementation = implementation self._backend_version = backend_version self._condition = condition self._then_value = then_value self._otherwise_value = otherwise_value + self._dtypes = dtypes def __call__(self, df: PandasLikeDataFrame) -> list[PandasLikeSeries]: from narwhals._expression_parsing import parse_into_expr @@ -332,7 +341,9 @@ def __call__(self, df: PandasLikeDataFrame) -> list[PandasLikeSeries]: from narwhals._pandas_like.utils import validate_column_comparand plx = PandasLikeNamespace( - implementation=self._implementation, backend_version=self._backend_version + implementation=self._implementation, + backend_version=self._backend_version, + dtypes=self._dtypes, ) condition = parse_into_expr(self._condition, namespace=plx)._call(df)[0] # type: ignore[arg-type] @@ -346,6 +357,7 @@ def __call__(self, df: PandasLikeDataFrame) -> list[PandasLikeSeries]: index=condition._native_series.index, implementation=self._implementation, backend_version=self._backend_version, + dtypes=self._dtypes, ) value_series = cast(PandasLikeSeries, value_series) @@ -383,6 +395,7 @@ def then(self, value: PandasLikeExpr | PandasLikeSeries | Any) -> PandasThen: output_names=None, implementation=self._implementation, backend_version=self._backend_version, + dtypes=self._dtypes, ) @@ -397,10 +410,11 @@ def __init__( output_names: list[str] | None, implementation: Implementation, backend_version: tuple[int, ...], + dtypes: DTypes, ) -> None: self._implementation = implementation self._backend_version = backend_version - + self._dtypes = dtypes self._call = call self._depth = depth self._function_name = function_name diff --git a/narwhals/_pandas_like/selectors.py b/narwhals/_pandas_like/selectors.py index 1214e12fc..74235afa5 100644 --- 
a/narwhals/_pandas_like/selectors.py +++ b/narwhals/_pandas_like/selectors.py @@ -4,22 +4,27 @@ from typing import Any from typing import NoReturn -from narwhals import dtypes from narwhals._pandas_like.expr import PandasLikeExpr if TYPE_CHECKING: from narwhals._pandas_like.dataframe import PandasLikeDataFrame from narwhals._pandas_like.series import PandasLikeSeries from narwhals.dtypes import DType + from narwhals.typing import DTypes from narwhals.utils import Implementation class PandasSelectorNamespace: def __init__( - self, *, implementation: Implementation, backend_version: tuple[int, ...] + self, + *, + implementation: Implementation, + backend_version: tuple[int, ...], + dtypes: DTypes, ) -> None: self._implementation = implementation self._backend_version = backend_version + self._dtypes = dtypes def by_dtype(self, dtypes: list[DType | type[DType]]) -> PandasSelector: def func(df: PandasLikeDataFrame) -> list[PandasLikeSeries]: @@ -33,32 +38,33 @@ def func(df: PandasLikeDataFrame) -> list[PandasLikeSeries]: output_names=None, implementation=self._implementation, backend_version=self._backend_version, + dtypes=self._dtypes, ) def numeric(self) -> PandasSelector: return self.by_dtype( [ - dtypes.Int64, - dtypes.Int32, - dtypes.Int16, - dtypes.Int8, - dtypes.UInt64, - dtypes.UInt32, - dtypes.UInt16, - dtypes.UInt8, - dtypes.Float64, - dtypes.Float32, + self._dtypes.Int64, + self._dtypes.Int32, + self._dtypes.Int16, + self._dtypes.Int8, + self._dtypes.UInt64, + self._dtypes.UInt32, + self._dtypes.UInt16, + self._dtypes.UInt8, + self._dtypes.Float64, + self._dtypes.Float32, ], ) def categorical(self) -> PandasSelector: - return self.by_dtype([dtypes.Categorical]) + return self.by_dtype([self._dtypes.Categorical]) def string(self) -> PandasSelector: - return self.by_dtype([dtypes.String]) + return self.by_dtype([self._dtypes.String]) def boolean(self) -> PandasSelector: - return self.by_dtype([dtypes.Boolean]) + return self.by_dtype([self._dtypes.Boolean]) def all(self) -> PandasSelector: def func(df: PandasLikeDataFrame) -> list[PandasLikeSeries]: @@ -72,6 +78,7 @@ def func(df: PandasLikeDataFrame) -> list[PandasLikeSeries]: output_names=None, implementation=self._implementation, backend_version=self._backend_version, + dtypes=self._dtypes, ) @@ -94,6 +101,7 @@ def _to_expr(self) -> PandasLikeExpr: output_names=self._output_names, implementation=self._implementation, backend_version=self._backend_version, + dtypes=self._dtypes, ) def __sub__(self, other: PandasSelector | Any) -> PandasSelector | Any: @@ -112,6 +120,7 @@ def call(df: PandasLikeDataFrame) -> list[PandasLikeSeries]: output_names=None, implementation=self._implementation, backend_version=self._backend_version, + dtypes=self._dtypes, ) else: return self._to_expr() - other @@ -132,6 +141,7 @@ def call(df: PandasLikeDataFrame) -> list[PandasLikeSeries]: output_names=None, implementation=self._implementation, backend_version=self._backend_version, + dtypes=self._dtypes, ) else: return self._to_expr() | other @@ -152,6 +162,7 @@ def call(df: PandasLikeDataFrame) -> list[PandasLikeSeries]: output_names=None, implementation=self._implementation, backend_version=self._backend_version, + dtypes=self._dtypes, ) else: return self._to_expr() & other @@ -159,7 +170,9 @@ def call(df: PandasLikeDataFrame) -> list[PandasLikeSeries]: def __invert__(self) -> PandasSelector: return ( PandasSelectorNamespace( - implementation=self._implementation, backend_version=self._backend_version + implementation=self._implementation, + 
backend_version=self._backend_version, + dtypes=self._dtypes, ).all() - self ) diff --git a/narwhals/_pandas_like/series.py b/narwhals/_pandas_like/series.py index 8557f8eee..dc9a00009 100644 --- a/narwhals/_pandas_like/series.py +++ b/narwhals/_pandas_like/series.py @@ -24,6 +24,7 @@ from narwhals._pandas_like.dataframe import PandasLikeDataFrame from narwhals.dtypes import DType + from narwhals.typing import DTypes PANDAS_TO_NUMPY_DTYPE_NO_MISSING = { "Int64": "int64", @@ -78,11 +79,13 @@ def __init__( *, implementation: Implementation, backend_version: tuple[int, ...], + dtypes: DTypes, ) -> None: self._name = native_series.name self._native_series = native_series self._implementation = implementation self._backend_version = backend_version + self._dtypes = dtypes # In pandas, copy-on-write becomes the default in version 3. # So, before that, we need to explicitly avoid unnecessary @@ -131,6 +134,7 @@ def _from_native_series(self, series: Any) -> Self: series, implementation=self._implementation, backend_version=self._backend_version, + dtypes=self._dtypes, ) @classmethod @@ -142,6 +146,7 @@ def _from_iterable( *, implementation: Implementation, backend_version: tuple[int, ...], + dtypes: DTypes, ) -> Self: return cls( native_series_from_iterable( @@ -152,6 +157,7 @@ def _from_iterable( ), implementation=implementation, backend_version=backend_version, + dtypes=dtypes, ) def __len__(self) -> int: @@ -167,7 +173,7 @@ def shape(self) -> tuple[int]: @property def dtype(self: Self) -> DType: - return native_to_narwhals_dtype(self._native_series) + return native_to_narwhals_dtype(self._native_series, self._dtypes) def scatter(self, indices: int | Sequence[int], values: Any) -> Self: if isinstance(values, self.__class__): @@ -190,7 +196,9 @@ def cast( dtype: Any, ) -> Self: ser = self._native_series - dtype = narwhals_to_native_dtype(dtype, ser.dtype, self._implementation) + dtype = narwhals_to_native_dtype( + dtype, ser.dtype, self._implementation, self._dtypes + ) return self._from_native_series(ser.astype(dtype)) def item(self: Self, index: int | None = None) -> Any: @@ -212,6 +220,7 @@ def to_frame(self) -> PandasLikeDataFrame: self._native_series.to_frame(), implementation=self._implementation, backend_version=self._backend_version, + dtypes=self._dtypes, ) def to_list(self) -> Any: @@ -598,6 +607,7 @@ def value_counts( val_count, implementation=self._implementation, backend_version=self._backend_version, + dtypes=self._dtypes, ) def quantile( @@ -640,6 +650,7 @@ def to_dummies( ).astype(int), implementation=self._implementation, backend_version=self._backend_version, + dtypes=self._dtypes, ) def gather_every(self: Self, n: int, offset: int = 0) -> Self: diff --git a/narwhals/_pandas_like/utils.py b/narwhals/_pandas_like/utils.py index 286d712bf..726a07c56 100644 --- a/narwhals/_pandas_like/utils.py +++ b/narwhals/_pandas_like/utils.py @@ -14,6 +14,7 @@ from narwhals._pandas_like.expr import PandasLikeExpr from narwhals._pandas_like.series import PandasLikeSeries from narwhals.dtypes import DType + from narwhals.typing import DTypes ExprT = TypeVar("ExprT", bound=PandasLikeExpr) import pandas as pd @@ -94,6 +95,7 @@ def create_native_series( *, implementation: Implementation, backend_version: tuple[int, ...], + dtypes: DTypes, ) -> PandasLikeSeries: from narwhals._pandas_like.series import PandasLikeSeries @@ -102,7 +104,10 @@ def create_native_series( iterable, index=index, name="" ) return PandasLikeSeries( - series, implementation=implementation, backend_version=backend_version + 
series, + implementation=implementation, + backend_version=backend_version, + dtypes=dtypes, ) else: # pragma: no cover msg = f"Expected pandas-like implementation ({PANDAS_LIKE_IMPLEMENTATION}), found {implementation}" @@ -206,9 +211,7 @@ def set_axis( return obj.set_axis(index, axis=0, **kwargs) # type: ignore[attr-defined, no-any-return] -def native_to_narwhals_dtype(column: Any) -> DType: - from narwhals import dtypes - +def native_to_narwhals_dtype(column: Any, dtypes: DTypes) -> DType: dtype = str(column.dtype) if dtype in {"int64", "Int64", "Int64[pyarrow]", "int64[pyarrow]"}: return dtypes.Int64() @@ -280,7 +283,7 @@ def native_to_narwhals_dtype(column: Any) -> DType: try: return map_interchange_dtype_to_narwhals_dtype( - df.__dataframe__().get_column(0).dtype + df.__dataframe__().get_column(0).dtype, dtypes ) except Exception: # noqa: BLE001 return dtypes.Object() @@ -308,10 +311,11 @@ def get_dtype_backend(dtype: Any, implementation: Implementation) -> str: def narwhals_to_native_dtype( # noqa: PLR0915 - dtype: DType | type[DType], starting_dtype: Any, implementation: Implementation + dtype: DType | type[DType], + starting_dtype: Any, + implementation: Implementation, + dtypes: DTypes, ) -> Any: - from narwhals import dtypes - if "polars" in str(type(dtype)): msg = ( f"Expected Narwhals object, got: {type(dtype)}.\n\n" diff --git a/narwhals/_polars/dataframe.py b/narwhals/_polars/dataframe.py index acf70778b..a4e30ec63 100644 --- a/narwhals/_polars/dataframe.py +++ b/narwhals/_polars/dataframe.py @@ -17,12 +17,17 @@ import numpy as np from typing_extensions import Self + from narwhals.typing import DTypes + class PolarsDataFrame: - def __init__(self, df: Any, *, backend_version: tuple[int, ...]) -> None: + def __init__( + self, df: Any, *, backend_version: tuple[int, ...], dtypes: DTypes + ) -> None: self._native_frame = df self._backend_version = backend_version self._implementation = Implementation.POLARS + self._dtypes = dtypes def __repr__(self) -> str: # pragma: no cover return "PolarsDataFrame" @@ -31,7 +36,7 @@ def __narwhals_dataframe__(self) -> Self: return self def __narwhals_namespace__(self) -> PolarsNamespace: - return PolarsNamespace(backend_version=self._backend_version) + return PolarsNamespace(backend_version=self._backend_version, dtypes=self._dtypes) def __native_namespace__(self: Self) -> ModuleType: if self._implementation is Implementation.POLARS: @@ -41,7 +46,9 @@ def __native_namespace__(self: Self) -> ModuleType: raise AssertionError(msg) def _from_native_frame(self, df: Any) -> Self: - return self.__class__(df, backend_version=self._backend_version) + return self.__class__( + df, backend_version=self._backend_version, dtypes=self._dtypes + ) def _from_native_object(self, obj: Any) -> Any: import polars as pl # ignore-banned-import() @@ -49,7 +56,9 @@ def _from_native_object(self, obj: Any) -> Any: if isinstance(obj, pl.Series): from narwhals._polars.series import PolarsSeries - return PolarsSeries(obj, backend_version=self._backend_version) + return PolarsSeries( + obj, backend_version=self._backend_version, dtypes=self._dtypes + ) if isinstance(obj, pl.DataFrame): return self._from_native_frame(obj) # scalar @@ -78,14 +87,20 @@ def __array__(self, dtype: Any | None = None, copy: bool | None = None) -> np.nd @property def schema(self) -> dict[str, Any]: schema = self._native_frame.schema - return {name: native_to_narwhals_dtype(dtype) for name, dtype in schema.items()} + return { + name: native_to_narwhals_dtype(dtype, self._dtypes) + for name, dtype in 
schema.items() + } def collect_schema(self) -> dict[str, Any]: if self._backend_version < (1,): # pragma: no cover schema = self._native_frame.schema else: schema = dict(self._native_frame.collect_schema()) - return {name: native_to_narwhals_dtype(dtype) for name, dtype in schema.items()} + return { + name: native_to_narwhals_dtype(dtype, self._dtypes) + for name, dtype in schema.items() + } @property def shape(self) -> tuple[int, int]: @@ -140,14 +155,18 @@ def __getitem__(self, item: Any) -> Any: if isinstance(result, pl.Series): from narwhals._polars.series import PolarsSeries - return PolarsSeries(result, backend_version=self._backend_version) + return PolarsSeries( + result, backend_version=self._backend_version, dtypes=self._dtypes + ) return self._from_native_object(result) def get_column(self, name: str) -> Any: from narwhals._polars.series import PolarsSeries return PolarsSeries( - self._native_frame.get_column(name), backend_version=self._backend_version + self._native_frame.get_column(name), + backend_version=self._backend_version, + dtypes=self._dtypes, ) def is_empty(self) -> bool: @@ -159,7 +178,9 @@ def columns(self) -> list[str]: def lazy(self) -> PolarsLazyFrame: return PolarsLazyFrame( - self._native_frame.lazy(), backend_version=self._backend_version + self._native_frame.lazy(), + backend_version=self._backend_version, + dtypes=self._dtypes, ) def to_dict(self, *, as_series: bool) -> Any: @@ -169,7 +190,9 @@ def to_dict(self, *, as_series: bool) -> Any: from narwhals._polars.series import PolarsSeries return { - name: PolarsSeries(col, backend_version=self._backend_version) + name: PolarsSeries( + col, backend_version=self._backend_version, dtypes=self._dtypes + ) for name, col in df.to_dict(as_series=True).items() } else: @@ -217,10 +240,13 @@ def unpivot( class PolarsLazyFrame: - def __init__(self, df: Any, *, backend_version: tuple[int, ...]) -> None: + def __init__( + self, df: Any, *, backend_version: tuple[int, ...], dtypes: DTypes + ) -> None: self._native_frame = df self._backend_version = backend_version self._implementation = Implementation.POLARS + self._dtypes = dtypes def __repr__(self) -> str: # pragma: no cover return "PolarsLazyFrame" @@ -229,7 +255,7 @@ def __narwhals_lazyframe__(self) -> Self: return self def __narwhals_namespace__(self) -> PolarsNamespace: - return PolarsNamespace(backend_version=self._backend_version) + return PolarsNamespace(backend_version=self._backend_version, dtypes=self._dtypes) def __native_namespace__(self: Self) -> ModuleType: if self._implementation is Implementation.POLARS: @@ -239,7 +265,9 @@ def __native_namespace__(self: Self) -> ModuleType: raise AssertionError(msg) def _from_native_frame(self, df: Any) -> Self: - return self.__class__(df, backend_version=self._backend_version) + return self.__class__( + df, backend_version=self._backend_version, dtypes=self._dtypes + ) def __getattr__(self, attr: str) -> Any: def func(*args: Any, **kwargs: Any) -> Any: @@ -257,18 +285,26 @@ def columns(self) -> list[str]: @property def schema(self) -> dict[str, Any]: schema = self._native_frame.schema - return {name: native_to_narwhals_dtype(dtype) for name, dtype in schema.items()} + return { + name: native_to_narwhals_dtype(dtype, self._dtypes) + for name, dtype in schema.items() + } def collect_schema(self) -> dict[str, Any]: if self._backend_version < (1,): # pragma: no cover schema = self._native_frame.schema else: schema = dict(self._native_frame.collect_schema()) - return {name: native_to_narwhals_dtype(dtype) for name, dtype in 
schema.items()} + return { + name: native_to_narwhals_dtype(dtype, self._dtypes) + for name, dtype in schema.items() + } def collect(self) -> PolarsDataFrame: return PolarsDataFrame( - self._native_frame.collect(), backend_version=self._backend_version + self._native_frame.collect(), + backend_version=self._backend_version, + dtypes=self._dtypes, ) def group_by(self, *by: str) -> Any: diff --git a/narwhals/_polars/expr.py b/narwhals/_polars/expr.py index 4f1532823..8a4c93736 100644 --- a/narwhals/_polars/expr.py +++ b/narwhals/_polars/expr.py @@ -12,18 +12,20 @@ from typing_extensions import Self from narwhals.dtypes import DType + from narwhals.typing import DTypes class PolarsExpr: - def __init__(self, expr: Any) -> None: + def __init__(self, expr: Any, dtypes: DTypes) -> None: self._native_expr = expr self._implementation = Implementation.POLARS + self._dtypes = dtypes def __repr__(self) -> str: # pragma: no cover return "PolarsExpr" def _from_native_expr(self, expr: Any) -> Self: - return self.__class__(expr) + return self.__class__(expr, dtypes=self._dtypes) def __getattr__(self, attr: str) -> Any: def func(*args: Any, **kwargs: Any) -> Any: @@ -36,7 +38,7 @@ def func(*args: Any, **kwargs: Any) -> Any: def cast(self, dtype: DType) -> Self: expr = self._native_expr - dtype = narwhals_to_native_dtype(dtype) + dtype = narwhals_to_native_dtype(dtype, self._dtypes) return self._from_native_expr(expr.cast(dtype)) def __eq__(self, other: object) -> Self: # type: ignore[override] diff --git a/narwhals/_polars/namespace.py b/narwhals/_polars/namespace.py index 275c104fc..21facd81f 100644 --- a/narwhals/_polars/namespace.py +++ b/narwhals/_polars/namespace.py @@ -7,7 +7,6 @@ from typing import Literal from typing import Sequence -from narwhals import dtypes from narwhals._expression_parsing import parse_into_exprs from narwhals._polars.utils import extract_args_kwargs from narwhals._polars.utils import narwhals_to_native_dtype @@ -18,35 +17,15 @@ from narwhals._polars.dataframe import PolarsLazyFrame from narwhals._polars.expr import PolarsExpr from narwhals._polars.typing import IntoPolarsExpr + from narwhals.dtypes import DType + from narwhals.typing import DTypes class PolarsNamespace: - Int64 = dtypes.Int64 - Int32 = dtypes.Int32 - Int16 = dtypes.Int16 - Int8 = dtypes.Int8 - UInt64 = dtypes.UInt64 - UInt32 = dtypes.UInt32 - UInt16 = dtypes.UInt16 - UInt8 = dtypes.UInt8 - Float64 = dtypes.Float64 - Float32 = dtypes.Float32 - Boolean = dtypes.Boolean - Object = dtypes.Object - Unknown = dtypes.Unknown - Categorical = dtypes.Categorical - Enum = dtypes.Enum - String = dtypes.String - Datetime = dtypes.Datetime - Duration = dtypes.Duration - Date = dtypes.Date - List = dtypes.List - Struct = dtypes.Struct - Array = dtypes.Array - - def __init__(self, *, backend_version: tuple[int, ...]) -> None: + def __init__(self, *, backend_version: tuple[int, ...], dtypes: DTypes) -> None: self._backend_version = backend_version self._implementation = Implementation.POLARS + self._dtypes = dtypes def __getattr__(self, attr: str) -> Any: import polars as pl # ignore-banned-import @@ -55,7 +34,7 @@ def __getattr__(self, attr: str) -> Any: def func(*args: Any, **kwargs: Any) -> Any: args, kwargs = extract_args_kwargs(args, kwargs) # type: ignore[assignment] - return PolarsExpr(getattr(pl, attr)(*args, **kwargs)) + return PolarsExpr(getattr(pl, attr)(*args, **kwargs), dtypes=self._dtypes) return func @@ -67,7 +46,7 @@ def nth(self, *indices: int) -> PolarsExpr: if self._backend_version < (1, 0, 0): # pragma: no 
cover msg = "`nth` is only supported for Polars>=1.0.0. Please use `col` for columns selection instead." raise AttributeError(msg) - return PolarsExpr(pl.nth(*indices)) + return PolarsExpr(pl.nth(*indices), dtypes=self._dtypes) def len(self) -> PolarsExpr: import polars as pl # ignore-banned-import() @@ -75,8 +54,8 @@ def len(self) -> PolarsExpr: from narwhals._polars.expr import PolarsExpr if self._backend_version < (0, 20, 5): # pragma: no cover - return PolarsExpr(pl.count().alias("len")) - return PolarsExpr(pl.len()) + return PolarsExpr(pl.count().alias("len"), dtypes=self._dtypes) + return PolarsExpr(pl.len(), dtypes=self._dtypes) def concat( self, @@ -92,17 +71,24 @@ def concat( dfs: list[Any] = [item._native_frame for item in items] result = pl.concat(dfs, how=how) if isinstance(result, pl.DataFrame): - return PolarsDataFrame(result, backend_version=items[0]._backend_version) - return PolarsLazyFrame(result, backend_version=items[0]._backend_version) + return PolarsDataFrame( + result, backend_version=items[0]._backend_version, dtypes=items[0]._dtypes + ) + return PolarsLazyFrame( + result, backend_version=items[0]._backend_version, dtypes=items[0]._dtypes + ) - def lit(self, value: Any, dtype: dtypes.DType | None = None) -> PolarsExpr: + def lit(self, value: Any, dtype: DType | None = None) -> PolarsExpr: import polars as pl # ignore-banned-import() from narwhals._polars.expr import PolarsExpr if dtype is not None: - return PolarsExpr(pl.lit(value, dtype=narwhals_to_native_dtype(dtype))) - return PolarsExpr(pl.lit(value)) + return PolarsExpr( + pl.lit(value, dtype=narwhals_to_native_dtype(dtype, self._dtypes)), + dtypes=self._dtypes, + ) + return PolarsExpr(pl.lit(value), dtypes=self._dtypes) def mean(self, *column_names: str) -> PolarsExpr: import polars as pl # ignore-banned-import() @@ -110,8 +96,8 @@ def mean(self, *column_names: str) -> PolarsExpr: from narwhals._polars.expr import PolarsExpr if self._backend_version < (0, 20, 4): # pragma: no cover - return PolarsExpr(pl.mean([*column_names])) # type: ignore[arg-type] - return PolarsExpr(pl.mean(*column_names)) + return PolarsExpr(pl.mean([*column_names]), dtypes=self._dtypes) # type: ignore[arg-type] + return PolarsExpr(pl.mean(*column_names), dtypes=self._dtypes) def mean_horizontal(self, *exprs: IntoPolarsExpr) -> PolarsExpr: import polars as pl # ignore-banned-import() @@ -125,23 +111,34 @@ def mean_horizontal(self, *exprs: IntoPolarsExpr) -> PolarsExpr: n_non_zero = reduce( lambda x, y: x + y, ((1 - e.is_null()) for e in polars_exprs) ) - return PolarsExpr(total._native_expr / n_non_zero._native_expr) + return PolarsExpr( + total._native_expr / n_non_zero._native_expr, dtypes=self._dtypes + ) - return PolarsExpr(pl.mean_horizontal([e._native_expr for e in polars_exprs])) + return PolarsExpr( + pl.mean_horizontal([e._native_expr for e in polars_exprs]), + dtypes=self._dtypes, + ) @property def selectors(self) -> PolarsSelectors: - return PolarsSelectors() + return PolarsSelectors(self._dtypes) class PolarsSelectors: - def by_dtype(self, dtypes: Iterable[dtypes.DType]) -> PolarsExpr: + def __init__(self, dtypes: DTypes) -> None: + self._dtypes = dtypes + + def by_dtype(self, dtypes: Iterable[DType]) -> PolarsExpr: import polars as pl # ignore-banned-import() from narwhals._polars.expr import PolarsExpr return PolarsExpr( - pl.selectors.by_dtype([narwhals_to_native_dtype(dtype) for dtype in dtypes]) + pl.selectors.by_dtype( + [narwhals_to_native_dtype(dtype, self._dtypes) for dtype in dtypes] + ), + dtypes=self._dtypes, ) 
def numeric(self) -> PolarsExpr: @@ -149,32 +146,32 @@ def numeric(self) -> PolarsExpr: from narwhals._polars.expr import PolarsExpr - return PolarsExpr(pl.selectors.numeric()) + return PolarsExpr(pl.selectors.numeric(), dtypes=self._dtypes) def boolean(self) -> PolarsExpr: import polars as pl # ignore-banned-import() from narwhals._polars.expr import PolarsExpr - return PolarsExpr(pl.selectors.boolean()) + return PolarsExpr(pl.selectors.boolean(), dtypes=self._dtypes) def string(self) -> PolarsExpr: import polars as pl # ignore-banned-import() from narwhals._polars.expr import PolarsExpr - return PolarsExpr(pl.selectors.string()) + return PolarsExpr(pl.selectors.string(), dtypes=self._dtypes) def categorical(self) -> PolarsExpr: import polars as pl # ignore-banned-import() from narwhals._polars.expr import PolarsExpr - return PolarsExpr(pl.selectors.categorical()) + return PolarsExpr(pl.selectors.categorical(), dtypes=self._dtypes) def all(self) -> PolarsExpr: import polars as pl # ignore-banned-import() from narwhals._polars.expr import PolarsExpr - return PolarsExpr(pl.selectors.all()) + return PolarsExpr(pl.selectors.all(), dtypes=self._dtypes) diff --git a/narwhals/_polars/series.py b/narwhals/_polars/series.py index 7f7bf94a2..078042195 100644 --- a/narwhals/_polars/series.py +++ b/narwhals/_polars/series.py @@ -17,16 +17,20 @@ from narwhals._polars.dataframe import PolarsDataFrame from narwhals.dtypes import DType + from narwhals.typing import DTypes from narwhals._polars.utils import narwhals_to_native_dtype from narwhals._polars.utils import native_to_narwhals_dtype class PolarsSeries: - def __init__(self, series: Any, *, backend_version: tuple[int, ...]) -> None: + def __init__( + self, series: Any, *, backend_version: tuple[int, ...], dtypes: DTypes + ) -> None: self._native_series = series self._backend_version = backend_version self._implementation = Implementation.POLARS + self._dtypes = dtypes def __repr__(self) -> str: # pragma: no cover return "PolarsSeries" @@ -42,7 +46,9 @@ def __native_namespace__(self: Self) -> ModuleType: raise AssertionError(msg) def _from_native_series(self, series: Any) -> Self: - return self.__class__(series, backend_version=self._backend_version) + return self.__class__( + series, backend_version=self._backend_version, dtypes=self._dtypes + ) def _from_native_object(self, series: Any) -> Any: import polars as pl # ignore-banned-import() @@ -52,7 +58,9 @@ def _from_native_object(self, series: Any) -> Any: if isinstance(series, pl.DataFrame): from narwhals._polars.dataframe import PolarsDataFrame - return PolarsDataFrame(series, backend_version=self._backend_version) + return PolarsDataFrame( + series, backend_version=self._backend_version, dtypes=self._dtypes + ) # scalar return series @@ -81,7 +89,7 @@ def name(self) -> str: @property def dtype(self: Self) -> DType: - return native_to_narwhals_dtype(self._native_series.dtype) + return native_to_narwhals_dtype(self._native_series.dtype, self._dtypes) @overload def __getitem__(self, item: int) -> Any: ... 
@@ -94,7 +102,7 @@ def __getitem__(self, item: int | slice | Sequence[int]) -> Any | Self: def cast(self, dtype: DType) -> Self: ser = self._native_series - dtype = narwhals_to_native_dtype(dtype) + dtype = narwhals_to_native_dtype(dtype, self._dtypes) return self._from_native_series(ser.cast(dtype)) def __array__(self, dtype: Any = None, copy: bool | None = None) -> np.ndarray: @@ -184,7 +192,9 @@ def to_dummies( separator=separator, drop_first=drop_first ) - return PolarsDataFrame(result, backend_version=self._backend_version) + return PolarsDataFrame( + result, backend_version=self._backend_version, dtypes=self._dtypes + ) def sort(self, *, descending: bool = False, nulls_last: bool = False) -> Self: if self._backend_version < (0, 20, 6): # pragma: no cover @@ -232,7 +242,9 @@ def value_counts( sort=sort, parallel=parallel, name=name, normalize=normalize ) - return PolarsDataFrame(result, backend_version=self._backend_version) + return PolarsDataFrame( + result, backend_version=self._backend_version, dtypes=self._dtypes + ) @property def dt(self) -> PolarsSeriesDateTimeNamespace: diff --git a/narwhals/_polars/utils.py b/narwhals/_polars/utils.py index 45c464e51..db5a4a96b 100644 --- a/narwhals/_polars/utils.py +++ b/narwhals/_polars/utils.py @@ -1,8 +1,11 @@ from __future__ import annotations +from typing import TYPE_CHECKING from typing import Any -from narwhals import dtypes +if TYPE_CHECKING: + from narwhals.dtypes import DType + from narwhals.typing import DTypes def extract_native(obj: Any) -> Any: @@ -26,7 +29,7 @@ def extract_args_kwargs(args: Any, kwargs: Any) -> tuple[list[Any], dict[str, An return args, kwargs -def native_to_narwhals_dtype(dtype: Any) -> dtypes.DType: +def native_to_narwhals_dtype(dtype: Any, dtypes: DTypes) -> DType: import polars as pl # ignore-banned-import() if dtype == pl.Float64: @@ -74,11 +77,9 @@ def native_to_narwhals_dtype(dtype: Any) -> dtypes.DType: return dtypes.Unknown() -def narwhals_to_native_dtype(dtype: dtypes.DType | type[dtypes.DType]) -> Any: +def narwhals_to_native_dtype(dtype: DType | type[DType], dtypes: DTypes) -> Any: import polars as pl # ignore-banned-import() - from narwhals import dtypes - if dtype == dtypes.Float64: return pl.Float64() if dtype == dtypes.Float32: diff --git a/narwhals/functions.py b/narwhals/functions.py index 430705e66..f0bf5d4ad 100644 --- a/narwhals/functions.py +++ b/narwhals/functions.py @@ -26,6 +26,7 @@ from narwhals.dtypes import DType from narwhals.schema import Schema from narwhals.series import Series + from narwhals.typing import DTypes def concat( @@ -194,6 +195,25 @@ def new_series( 2 ] """ + from narwhals import dtypes + + return _new_series_impl( + name, + values, + dtype, + native_namespace=native_namespace, + dtypes=dtypes, # type: ignore[arg-type] + ) + + +def _new_series_impl( + name: str, + values: Any, + dtype: DType | type[DType] | None = None, + *, + native_namespace: ModuleType, + dtypes: DTypes, +) -> Series: implementation = Implementation.from_native_namespace(native_namespace) if implementation is Implementation.POLARS: @@ -202,7 +222,7 @@ def new_series( narwhals_to_native_dtype as polars_narwhals_to_native_dtype, ) - dtype = polars_narwhals_to_native_dtype(dtype) + dtype = polars_narwhals_to_native_dtype(dtype, dtypes=dtypes) native_series = native_namespace.Series(name=name, values=values, dtype=dtype) elif implementation in { @@ -215,7 +235,12 @@ def new_series( narwhals_to_native_dtype as pandas_like_narwhals_to_native_dtype, ) - dtype = 
pandas_like_narwhals_to_native_dtype(dtype, None, implementation) + dtype = pandas_like_narwhals_to_native_dtype( + dtype, + None, + implementation, + dtypes, + ) native_series = native_namespace.Series(values, name=name, dtype=dtype) elif implementation is Implementation.PYARROW: @@ -224,7 +249,7 @@ def new_series( narwhals_to_native_dtype as arrow_narwhals_to_native_dtype, ) - dtype = arrow_narwhals_to_native_dtype(dtype) + dtype = arrow_narwhals_to_native_dtype(dtype, dtypes=dtypes) native_series = native_namespace.chunked_array([values], type=dtype) elif implementation is Implementation.DASK: @@ -291,6 +316,23 @@ def from_dict( │ 2 ┆ 4 │ └─────┴─────┘ """ + from narwhals import dtypes + + return _from_dict_impl( + data, + schema, + native_namespace=native_namespace, + dtypes=dtypes, # type: ignore[arg-type] + ) + + +def _from_dict_impl( + data: dict[str, Any], + schema: dict[str, DType] | Schema | None = None, + *, + native_namespace: ModuleType | None = None, + dtypes: DTypes, +) -> DataFrame[Any]: from narwhals.series import Series from narwhals.translate import to_native @@ -315,7 +357,7 @@ def from_dict( ) schema = { - name: polars_narwhals_to_native_dtype(dtype) + name: polars_narwhals_to_native_dtype(dtype, dtypes=dtypes) for name, dtype in schema.items() } @@ -334,7 +376,10 @@ def from_dict( schema = { name: pandas_like_narwhals_to_native_dtype( - schema[name], native_type, implementation + schema[name], + native_type, + implementation, + dtypes, ) for name, native_type in native_frame.dtypes.items() } @@ -348,7 +393,7 @@ def from_dict( schema = native_namespace.schema( [ - (name, arrow_narwhals_to_native_dtype(dtype)) + (name, arrow_narwhals_to_native_dtype(dtype, dtypes)) for name, dtype in schema.items() ] ) diff --git a/narwhals/stable/v1/__init__.py b/narwhals/stable/v1/__init__.py index 8c4bd877c..b542b90fa 100644 --- a/narwhals/stable/v1/__init__.py +++ b/narwhals/stable/v1/__init__.py @@ -15,35 +15,38 @@ from narwhals import selectors from narwhals.dataframe import DataFrame as NwDataFrame from narwhals.dataframe import LazyFrame as NwLazyFrame -from narwhals.dtypes import Array -from narwhals.dtypes import Boolean -from narwhals.dtypes import Categorical -from narwhals.dtypes import Date -from narwhals.dtypes import Datetime -from narwhals.dtypes import Duration -from narwhals.dtypes import Enum -from narwhals.dtypes import Float32 -from narwhals.dtypes import Float64 -from narwhals.dtypes import Int8 -from narwhals.dtypes import Int16 -from narwhals.dtypes import Int32 -from narwhals.dtypes import Int64 -from narwhals.dtypes import List -from narwhals.dtypes import Object -from narwhals.dtypes import String -from narwhals.dtypes import Struct -from narwhals.dtypes import UInt8 -from narwhals.dtypes import UInt16 -from narwhals.dtypes import UInt32 -from narwhals.dtypes import UInt64 -from narwhals.dtypes import Unknown from narwhals.expr import Expr as NwExpr from narwhals.expr import Then as NwThen from narwhals.expr import When as NwWhen from narwhals.expr import when as nw_when +from narwhals.functions import _from_dict_impl +from narwhals.functions import _new_series_impl from narwhals.functions import show_versions from narwhals.schema import Schema as NwSchema from narwhals.series import Series as NwSeries +from narwhals.stable.v1.dtypes import Array +from narwhals.stable.v1.dtypes import Boolean +from narwhals.stable.v1.dtypes import Categorical +from narwhals.stable.v1.dtypes import Date +from narwhals.stable.v1.dtypes import Datetime +from 
narwhals.stable.v1.dtypes import Duration +from narwhals.stable.v1.dtypes import Enum +from narwhals.stable.v1.dtypes import Float32 +from narwhals.stable.v1.dtypes import Float64 +from narwhals.stable.v1.dtypes import Int8 +from narwhals.stable.v1.dtypes import Int16 +from narwhals.stable.v1.dtypes import Int32 +from narwhals.stable.v1.dtypes import Int64 +from narwhals.stable.v1.dtypes import List +from narwhals.stable.v1.dtypes import Object +from narwhals.stable.v1.dtypes import String +from narwhals.stable.v1.dtypes import Struct +from narwhals.stable.v1.dtypes import UInt8 +from narwhals.stable.v1.dtypes import UInt16 +from narwhals.stable.v1.dtypes import UInt32 +from narwhals.stable.v1.dtypes import UInt64 +from narwhals.stable.v1.dtypes import Unknown +from narwhals.translate import _from_native_impl from narwhals.translate import get_native_namespace as nw_get_native_namespace from narwhals.translate import to_native from narwhals.typing import IntoDataFrameT @@ -811,18 +814,21 @@ def from_native( Returns: narwhals.DataFrame or narwhals.LazyFrame or narwhals.Series """ + from narwhals.stable.v1 import dtypes + # Early returns if isinstance(native_dataframe, (DataFrame, LazyFrame)) and not series_only: return native_dataframe if isinstance(native_dataframe, Series) and (series_only or allow_series): return native_dataframe - result = nw.from_native( + result = _from_native_impl( native_dataframe, strict=strict, eager_only=eager_only, eager_or_interchange_only=eager_or_interchange_only, series_only=series_only, allow_series=allow_series, + dtypes=dtypes, # type: ignore[arg-type] ) return _stableify(result) @@ -1941,8 +1947,16 @@ def new_series( 2 ] """ + from narwhals.stable.v1 import dtypes + return _stableify( - nw.new_series(name, values, dtype, native_namespace=native_namespace) + _new_series_impl( + name, + values, + dtype, + native_namespace=native_namespace, + dtypes=dtypes, # type: ignore[arg-type] + ) ) @@ -1996,8 +2010,15 @@ def from_dict( │ 2 ┆ 4 │ └─────┴─────┘ """ - return _stableify( # type: ignore[no-any-return] - nw.from_dict(data, schema=schema, native_namespace=native_namespace) + from narwhals.stable.v1 import dtypes + + return _stableify( + _from_dict_impl( + data, + schema, + native_namespace=native_namespace, + dtypes=dtypes, # type: ignore[arg-type] + ) ) diff --git a/narwhals/stable/v1/dtypes.py b/narwhals/stable/v1/dtypes.py new file mode 100644 index 000000000..942881ba4 --- /dev/null +++ b/narwhals/stable/v1/dtypes.py @@ -0,0 +1,47 @@ +from narwhals.dtypes import Array +from narwhals.dtypes import Boolean +from narwhals.dtypes import Categorical +from narwhals.dtypes import Date +from narwhals.dtypes import Datetime +from narwhals.dtypes import Duration +from narwhals.dtypes import Enum +from narwhals.dtypes import Float32 +from narwhals.dtypes import Float64 +from narwhals.dtypes import Int8 +from narwhals.dtypes import Int16 +from narwhals.dtypes import Int32 +from narwhals.dtypes import Int64 +from narwhals.dtypes import List +from narwhals.dtypes import Object +from narwhals.dtypes import String +from narwhals.dtypes import Struct +from narwhals.dtypes import UInt8 +from narwhals.dtypes import UInt16 +from narwhals.dtypes import UInt32 +from narwhals.dtypes import UInt64 +from narwhals.dtypes import Unknown + +__all__ = [ + "Array", + "Boolean", + "Categorical", + "Date", + "Datetime", + "Duration", + "Enum", + "Float32", + "Float64", + "Int8", + "Int16", + "Int32", + "Int64", + "List", + "Object", + "String", + "Struct", + "UInt8", + "UInt16", + 
"UInt32", + "UInt64", + "Unknown", +] diff --git a/narwhals/translate.py b/narwhals/translate.py index 0e7706fb7..4c23f6d91 100644 --- a/narwhals/translate.py +++ b/narwhals/translate.py @@ -34,6 +34,7 @@ from narwhals.dataframe import DataFrame from narwhals.dataframe import LazyFrame from narwhals.series import Series + from narwhals.typing import DTypes from narwhals.typing import IntoDataFrameT from narwhals.typing import IntoFrameT @@ -296,7 +297,7 @@ def from_native( ) -> Any: ... -def from_native( # noqa: PLR0915 +def from_native( native_object: Any, *, strict: bool = True, @@ -330,6 +331,29 @@ def from_native( # noqa: PLR0915 Returns: narwhals.DataFrame or narwhals.LazyFrame or narwhals.Series """ + from narwhals import dtypes + + return _from_native_impl( + native_object, + strict=strict, + eager_only=eager_only, + eager_or_interchange_only=eager_or_interchange_only, + series_only=series_only, + allow_series=allow_series, + dtypes=dtypes, # type: ignore[arg-type] + ) + + +def _from_native_impl( # noqa: PLR0915 + native_object: Any, + *, + strict: bool = True, + eager_only: bool | None = None, + eager_or_interchange_only: bool | None = None, + series_only: bool | None = None, + allow_series: bool | None = None, + dtypes: DTypes, +) -> Any: from narwhals._arrow.dataframe import ArrowDataFrame from narwhals._arrow.series import ArrowSeries from narwhals._dask.dataframe import DaskLazyFrame @@ -398,7 +422,11 @@ def from_native( # noqa: PLR0915 raise TypeError(msg) pl = get_polars() return DataFrame( - PolarsDataFrame(native_object, backend_version=parse_version(pl.__version__)), + PolarsDataFrame( + native_object, + backend_version=parse_version(pl.__version__), + dtypes=dtypes, + ), level="full", ) elif is_polars_lazyframe(native_object): @@ -410,7 +438,11 @@ def from_native( # noqa: PLR0915 raise TypeError(msg) pl = get_polars() return LazyFrame( - PolarsLazyFrame(native_object, backend_version=parse_version(pl.__version__)), + PolarsLazyFrame( + native_object, + backend_version=parse_version(pl.__version__), + dtypes=dtypes, + ), level="full", ) elif is_polars_series(native_object): @@ -419,7 +451,11 @@ def from_native( # noqa: PLR0915 msg = "Please set `allow_series=True`" raise TypeError(msg) return Series( - PolarsSeries(native_object, backend_version=parse_version(pl.__version__)), + PolarsSeries( + native_object, + backend_version=parse_version(pl.__version__), + dtypes=dtypes, + ), level="full", ) @@ -434,6 +470,7 @@ def from_native( # noqa: PLR0915 native_object, backend_version=parse_version(pd.__version__), implementation=Implementation.PANDAS, + dtypes=dtypes, ), level="full", ) @@ -447,6 +484,7 @@ def from_native( # noqa: PLR0915 native_object, implementation=Implementation.PANDAS, backend_version=parse_version(pd.__version__), + dtypes=dtypes, ), level="full", ) @@ -462,6 +500,7 @@ def from_native( # noqa: PLR0915 native_object, implementation=Implementation.MODIN, backend_version=parse_version(mpd.__version__), + dtypes=dtypes, ), level="full", ) @@ -475,6 +514,7 @@ def from_native( # noqa: PLR0915 native_object, implementation=Implementation.MODIN, backend_version=parse_version(mpd.__version__), + dtypes=dtypes, ), level="full", ) @@ -490,6 +530,7 @@ def from_native( # noqa: PLR0915 native_object, implementation=Implementation.CUDF, backend_version=parse_version(cudf.__version__), + dtypes=dtypes, ), level="full", ) @@ -503,6 +544,7 @@ def from_native( # noqa: PLR0915 native_object, implementation=Implementation.CUDF, backend_version=parse_version(cudf.__version__), + 
dtypes=dtypes, ), level="full", ) @@ -514,7 +556,11 @@ def from_native( # noqa: PLR0915 msg = "Cannot only use `series_only` with arrow table" raise TypeError(msg) return DataFrame( - ArrowDataFrame(native_object, backend_version=parse_version(pa.__version__)), + ArrowDataFrame( + native_object, + backend_version=parse_version(pa.__version__), + dtypes=dtypes, + ), level="full", ) elif is_pyarrow_chunked_array(native_object): @@ -524,7 +570,10 @@ def from_native( # noqa: PLR0915 raise TypeError(msg) return Series( ArrowSeries( - native_object, backend_version=parse_version(pa.__version__), name="" + native_object, + backend_version=parse_version(pa.__version__), + name="", + dtypes=dtypes, ), level="full", ) @@ -542,7 +591,9 @@ def from_native( # noqa: PLR0915 raise ImportError(msg) return LazyFrame( DaskLazyFrame( - native_object, backend_version=parse_version(get_dask().__version__) + native_object, + backend_version=parse_version(get_dask().__version__), + dtypes=dtypes, ), level="full", ) @@ -556,7 +607,7 @@ def from_native( # noqa: PLR0915 ) raise TypeError(msg) return DataFrame( - DuckDBInterchangeFrame(native_object), + DuckDBInterchangeFrame(native_object, dtypes=dtypes), level="interchange", ) @@ -569,7 +620,7 @@ def from_native( # noqa: PLR0915 ) raise TypeError(msg) return DataFrame( - IbisInterchangeFrame(native_object), + IbisInterchangeFrame(native_object, dtypes=dtypes), level="interchange", ) @@ -582,7 +633,7 @@ def from_native( # noqa: PLR0915 ) raise TypeError(msg) return DataFrame( - InterchangeFrame(native_object), + InterchangeFrame(native_object, dtypes=dtypes), level="interchange", ) diff --git a/narwhals/typing.py b/narwhals/typing.py index ecc89a4b2..62a7ca58c 100644 --- a/narwhals/typing.py +++ b/narwhals/typing.py @@ -14,6 +14,7 @@ else: from typing_extensions import TypeAlias + from narwhals import dtypes from narwhals.dataframe import DataFrame from narwhals.dataframe import LazyFrame from narwhals.expr import Expr @@ -52,6 +53,32 @@ def __dataframe__(self, *args: Any, **kwargs: Any) -> Any: ... 
FrameT = TypeVar("FrameT", "DataFrame[Any]", "LazyFrame[Any]") DataFrameT = TypeVar("DataFrameT", bound="DataFrame[Any]") + +class DTypes: + Int64: type[dtypes.Int64] + Int32: type[dtypes.Int32] + Int16: type[dtypes.Int16] + Int8: type[dtypes.Int8] + UInt64: type[dtypes.UInt64] + UInt32: type[dtypes.UInt32] + UInt16: type[dtypes.UInt16] + UInt8: type[dtypes.UInt8] + Float64: type[dtypes.Float64] + Float32: type[dtypes.Float32] + String: type[dtypes.String] + Boolean: type[dtypes.Boolean] + Object: type[dtypes.Object] + Categorical: type[dtypes.Categorical] + Enum: type[dtypes.Enum] + Datetime: type[dtypes.Datetime] + Duration: type[dtypes.Duration] + Date: type[dtypes.Date] + Struct: type[dtypes.Struct] + List: type[dtypes.List] + Array: type[dtypes.Array] + Unknown: type[dtypes.Unknown] + + __all__ = [ "IntoExpr", "IntoDataFrame", diff --git a/narwhals/utils.py b/narwhals/utils.py index 0d9503240..62ae7730b 100644 --- a/narwhals/utils.py +++ b/narwhals/utils.py @@ -11,7 +11,6 @@ from typing import TypeVar from typing import cast -from narwhals import dtypes from narwhals._exceptions import ColumnNotFoundError from narwhals.dependencies import get_cudf from narwhals.dependencies import get_dask_dataframe @@ -393,6 +392,8 @@ def is_ordered_categorical(series: Series) -> bool: """ from narwhals._interchange.series import InterchangeSeries + dtypes = series._compliant_series._dtypes + if ( isinstance(series._compliant_series, InterchangeSeries) and series.dtype == dtypes.Categorical diff --git a/tests/from_dict_test.py b/tests/from_dict_test.py index a1332908a..4583b03e5 100644 --- a/tests/from_dict_test.py +++ b/tests/from_dict_test.py @@ -1,6 +1,7 @@ import pytest -import narwhals.stable.v1 as nw +import narwhals as nw +import narwhals.stable.v1 as nw_v1 from tests.utils import Constructor from tests.utils import compare_dicts @@ -21,10 +22,10 @@ def test_from_dict_schema( ) -> None: if "dask" in str(constructor): request.applymarker(pytest.mark.xfail) - schema = {"c": nw.Int16(), "d": nw.Float32()} - df = nw.from_native(constructor({"a": [1, 2, 3], "b": [4, 5, 6]})) - native_namespace = nw.get_native_namespace(df) - result = nw.from_dict( + schema = {"c": nw_v1.Int16(), "d": nw_v1.Float32()} + df = nw_v1.from_native(constructor({"a": [1, 2, 3], "b": [4, 5, 6]})) + native_namespace = nw_v1.get_native_namespace(df) + result = nw_v1.from_dict( {"c": [1, 2], "d": [5, 6]}, native_namespace=native_namespace, schema=schema, # type: ignore[arg-type] @@ -55,6 +56,17 @@ def test_from_dict_one_native_one_narwhals( compare_dicts(result, expected) +def test_from_dict_v1(constructor: Constructor, request: pytest.FixtureRequest) -> None: + if "dask" in str(constructor): + request.applymarker(pytest.mark.xfail) + df = nw.from_native(constructor({"a": [1, 2, 3], "b": [4, 5, 6]})) + native_namespace = nw.get_native_namespace(df) + result = nw.from_dict({"c": [1, 2], "d": [5, 6]}, native_namespace=native_namespace) + expected = {"c": [1, 2], "d": [5, 6]} + compare_dicts(result, expected) + assert isinstance(result, nw.DataFrame) + + def test_from_dict_empty() -> None: with pytest.raises(ValueError, match="empty"): nw.from_dict({}) diff --git a/tests/new_series_test.py b/tests/new_series_test.py index 8ddcabd40..fad4a7536 100644 --- a/tests/new_series_test.py +++ b/tests/new_series_test.py @@ -3,7 +3,8 @@ import pandas as pd import pytest -import narwhals.stable.v1 as nw +import narwhals as nw +import narwhals.stable.v1 as nw_v1 from tests.utils import compare_dicts @@ -24,6 +25,25 @@ def 
test_new_series(constructor_eager: Any) -> None: compare_dicts(result.to_frame(), expected) +def test_new_series_v1(constructor_eager: Any) -> None: + s = nw_v1.from_native(constructor_eager({"a": [1, 2, 3]}), eager_only=True)["a"] + result = nw_v1.new_series( + "b", [4, 1, 2], native_namespace=nw_v1.get_native_namespace(s) + ) + expected = {"b": [4, 1, 2]} + # all supported libraries auto-infer this to be int64, we can always special-case + # something different if necessary + assert result.dtype == nw_v1.Int64 + compare_dicts(result.to_frame(), expected) + + result = nw_v1.new_series( + "b", [4, 1, 2], nw_v1.Int32, native_namespace=nw_v1.get_native_namespace(s) + ) + expected = {"b": [4, 1, 2]} + assert result.dtype == nw_v1.Int32 + compare_dicts(result.to_frame(), expected) + + def test_new_series_dask() -> None: pytest.importorskip("dask") pytest.importorskip("dask_expr", exc_type=ImportError) diff --git a/tests/stable_api_test.py b/tests/stable_api_test.py index d579b8185..a12b20cc6 100644 --- a/tests/stable_api_test.py +++ b/tests/stable_api_test.py @@ -1,3 +1,4 @@ +from datetime import datetime from typing import Any import polars as pl @@ -135,3 +136,10 @@ def test_series_docstrings() -> None: ) == getattr(df, item).__doc__ ) + + +def test_dtypes(constructor: Constructor) -> None: + df = nw.from_native(constructor({"a": [1], "b": [datetime(2020, 1, 1)]})) + dtype = df.collect_schema()["b"] + assert dtype in {nw.Datetime} + assert isinstance(dtype, nw.Datetime) From 93d2fc793aaaa1f2e3f641b8767d6bbae1669347 Mon Sep 17 00:00:00 2001 From: Francesco Bruzzesi <42817048+FBruzzesi@users.noreply.github.com> Date: Mon, 30 Sep 2024 19:45:16 +0200 Subject: [PATCH 066/145] feat: `Datetime(time_unit, time_zone)` and `Duration(time_unit)` types (#960) --- .github/workflows/extremes.yml | 6 +- narwhals/_arrow/utils.py | 13 ++-- narwhals/_pandas_like/series.py | 2 +- narwhals/_pandas_like/utils.py | 62 +++++++++++++++---- narwhals/_polars/utils.py | 24 +++++--- narwhals/dtypes.py | 95 ++++++++++++++++++++++++++++-- narwhals/functions.py | 13 ++-- narwhals/stable/v1/dtypes.py | 39 +++++++++++- tests/dtypes_test.py | 74 +++++++++++++++++++++++ tests/expr_and_series/cast_test.py | 36 +++++++++++ tests/series_only/cast_test.py | 4 +- tests/stable_api_test.py | 12 +++- utils/check_api_reference.py | 2 +- 13 files changed, 332 insertions(+), 50 deletions(-) create mode 100644 tests/dtypes_test.py diff --git a/.github/workflows/extremes.yml b/.github/workflows/extremes.yml index f11a4f4bb..858d0b6e2 100644 --- a/.github/workflows/extremes.yml +++ b/.github/workflows/extremes.yml @@ -119,6 +119,8 @@ jobs: kaggle kernels output "marcogorelli/variable-brink-glacier" - name: install-polars run: python -m pip install *.whl + - name: install-pandas-nightly + run: pip install --pre --extra-index-url https://pypi.anaconda.org/scientific-python-nightly-wheels/simple pandas - name: install-reqs run: uv pip install --upgrade tox virtualenv setuptools pip -r requirements-dev.txt --system - name: uninstall pyarrow @@ -127,8 +129,8 @@ jobs: # run: uv pip install --extra-index-url https://pypi.fury.io/arrow-nightlies/ --pre pyarrow --system - name: uninstall pandas run: uv pip uninstall pandas --system - - name: install-pandas-nightly - run: uv pip install --prerelease=allow --pre --extra-index-url https://pypi.anaconda.org/scientific-python-nightly-wheels/simple pandas --system + - name: show-deps + run: uv pip freeze - name: uninstall numpy run: uv pip uninstall numpy --system - name: install numpy nightly diff --git 
a/narwhals/_arrow/utils.py b/narwhals/_arrow/utils.py index d51a4b25d..e34e949d5 100644 --- a/narwhals/_arrow/utils.py +++ b/narwhals/_arrow/utils.py @@ -50,9 +50,9 @@ def native_to_narwhals_dtype(dtype: Any, dtypes: DTypes) -> DType: if pa.types.is_date32(dtype): return dtypes.Date() if pa.types.is_timestamp(dtype): - return dtypes.Datetime() + return dtypes.Datetime(time_unit=dtype.unit, time_zone=dtype.tz) if pa.types.is_duration(dtype): - return dtypes.Duration() + return dtypes.Duration(time_unit=dtype.unit) if pa.types.is_dictionary(dtype): return dtypes.Categorical() if pa.types.is_struct(dtype): @@ -94,11 +94,12 @@ def narwhals_to_native_dtype(dtype: DType | type[DType], dtypes: DTypes) -> Any: if isinstance_or_issubclass(dtype, dtypes.Categorical): return pa.dictionary(pa.uint32(), pa.string()) if isinstance_or_issubclass(dtype, dtypes.Datetime): - # Use Polars' default - return pa.timestamp("us") + time_unit = getattr(dtype, "time_unit", "us") + time_zone = getattr(dtype, "time_zone", None) + return pa.timestamp(time_unit, tz=time_zone) if isinstance_or_issubclass(dtype, dtypes.Duration): - # Use Polars' default - return pa.duration("us") + time_unit = getattr(dtype, "time_unit", "us") + return pa.duration(time_unit) if isinstance_or_issubclass(dtype, dtypes.Date): return pa.date32() if isinstance_or_issubclass(dtype, dtypes.List): # pragma: no cover diff --git a/narwhals/_pandas_like/series.py b/narwhals/_pandas_like/series.py index dc9a00009..6569f8b5d 100644 --- a/narwhals/_pandas_like/series.py +++ b/narwhals/_pandas_like/series.py @@ -197,7 +197,7 @@ def cast( ) -> Self: ser = self._native_series dtype = narwhals_to_native_dtype( - dtype, ser.dtype, self._implementation, self._dtypes + dtype, ser.dtype, self._implementation, self._backend_version, self._dtypes ) return self._from_native_series(ser.astype(dtype)) diff --git a/narwhals/_pandas_like/utils.py b/narwhals/_pandas_like/utils.py index 726a07c56..92fbd5193 100644 --- a/narwhals/_pandas_like/utils.py +++ b/narwhals/_pandas_like/utils.py @@ -1,8 +1,10 @@ from __future__ import annotations +import re from typing import TYPE_CHECKING from typing import Any from typing import Iterable +from typing import Literal from typing import TypeVar from narwhals.utils import Implementation @@ -213,6 +215,15 @@ def set_axis( def native_to_narwhals_dtype(column: Any, dtypes: DTypes) -> DType: dtype = str(column.dtype) + + pd_datetime_rgx = ( + r"^datetime64\[(?Ps|ms|us|ns)(?:, (?P[a-zA-Z\/]+))?\]$" + ) + pa_datetime_rgx = r"^timestamp\[(?Ps|ms|us|ns)(?:, tz=(?P[a-zA-Z\/]+))?\]\[pyarrow\]$" + + pd_duration_rgx = r"^timedelta64\[(?Ps|ms|us|ns)\]$" + pa_duration_rgx = r"^duration\[(?Ps|ms|us|ns)\]\[pyarrow\]$" + if dtype in {"int64", "Int64", "Int64[pyarrow]", "int64[pyarrow]"}: return dtypes.Int64() if dtype in {"int32", "Int32", "Int32[pyarrow]", "int32[pyarrow]"}: @@ -251,12 +262,17 @@ def native_to_narwhals_dtype(column: Any, dtypes: DTypes) -> DType: return dtypes.Boolean() if dtype == "category" or dtype.startswith("dictionary<"): return dtypes.Categorical() - if dtype.startswith(("datetime64", "timestamp[")): - # TODO(Unassigned): different time units and time zones - return dtypes.Datetime() - if dtype.startswith(("timedelta64", "duration")): - # TODO(Unassigned): different time units - return dtypes.Duration() + if (match_ := re.match(pd_datetime_rgx, dtype)) or ( + match_ := re.match(pa_datetime_rgx, dtype) + ): + dt_time_unit: Literal["us", "ns", "ms", "s"] = match_.group("time_unit") # type: ignore[assignment] + 
dt_time_zone: str | None = match_.group("time_zone") + return dtypes.Datetime(dt_time_unit, dt_time_zone) + if (match_ := re.match(pd_duration_rgx, dtype)) or ( + match_ := re.match(pa_duration_rgx, dtype) + ): + du_time_unit: Literal["us", "ns", "ms", "s"] = match_.group("time_unit") # type: ignore[assignment] + return dtypes.Duration(du_time_unit) if dtype == "date32[day][pyarrow]": return dtypes.Date() if dtype.startswith(("large_list", "list")): @@ -314,6 +330,7 @@ def narwhals_to_native_dtype( # noqa: PLR0915 dtype: DType | type[DType], starting_dtype: Any, implementation: Implementation, + backend_version: tuple[int, ...], dtypes: DTypes, ) -> Any: if "polars" in str(type(dtype)): @@ -416,15 +433,34 @@ def narwhals_to_native_dtype( # noqa: PLR0915 # convert to it? return "category" if isinstance_or_issubclass(dtype, dtypes.Datetime): - # TODO(Unassigned): different time units and time zones + dt_time_unit = getattr(dtype, "time_unit", "us") + dt_time_zone = getattr(dtype, "time_zone", None) + + # Pandas does not support "ms" or "us" time units before version 2.0 + # Let's overwrite with "ns" + if implementation is Implementation.PANDAS and backend_version < ( + 2, + ): # pragma: no cover + dt_time_unit = "ns" + if dtype_backend == "pyarrow-nullable": - return "timestamp[ns][pyarrow]" - return "datetime64[ns]" + tz_part = f", tz={dt_time_zone}" if dt_time_zone else "" + return f"timestamp[{dt_time_unit}{tz_part}][pyarrow]" + else: + tz_part = f", {dt_time_zone}" if dt_time_zone else "" + return f"datetime64[{dt_time_unit}{tz_part}]" if isinstance_or_issubclass(dtype, dtypes.Duration): - # TODO(Unassigned): different time units and time zones - if dtype_backend == "pyarrow-nullable": - return "duration[ns][pyarrow]" - return "timedelta64[ns]" + du_time_unit = getattr(dtype, "time_unit", "us") + if implementation is Implementation.PANDAS and backend_version < ( + 2, + ): # pragma: no cover + dt_time_unit = "ns" + return ( + f"duration[{du_time_unit}][pyarrow]" + if dtype_backend == "pyarrow-nullable" + else f"timedelta64[{du_time_unit}]" + ) + if isinstance_or_issubclass(dtype, dtypes.Date): if dtype_backend == "pyarrow-nullable": return "date32[pyarrow]" diff --git a/narwhals/_polars/utils.py b/narwhals/_polars/utils.py index db5a4a96b..b2f060906 100644 --- a/narwhals/_polars/utils.py +++ b/narwhals/_polars/utils.py @@ -2,6 +2,7 @@ from typing import TYPE_CHECKING from typing import Any +from typing import Literal if TYPE_CHECKING: from narwhals.dtypes import DType @@ -62,12 +63,15 @@ def native_to_narwhals_dtype(dtype: Any, dtypes: DTypes) -> DType: return dtypes.Categorical() if dtype == pl.Enum: return dtypes.Enum() - if dtype == pl.Datetime: - return dtypes.Datetime() - if dtype == pl.Duration: - return dtypes.Duration() if dtype == pl.Date: return dtypes.Date() + if dtype == pl.Datetime or isinstance(dtype, pl.Datetime): + dt_time_unit: Literal["us", "ns", "ms"] = getattr(dtype, "time_unit", "us") + dt_time_zone = getattr(dtype, "time_zone", None) + return dtypes.Datetime(time_unit=dt_time_unit, time_zone=dt_time_zone) + if dtype == pl.Duration or isinstance(dtype, pl.Duration): + du_time_unit: Literal["us", "ns", "ms"] = getattr(dtype, "time_unit", "us") + return dtypes.Duration(time_unit=du_time_unit) if dtype == pl.Struct: return dtypes.Struct() if dtype == pl.List: @@ -111,12 +115,16 @@ def narwhals_to_native_dtype(dtype: DType | type[DType], dtypes: DTypes) -> Any: if dtype == dtypes.Enum: msg = "Converting to Enum is not (yet) supported" raise NotImplementedError(msg) - if 
dtype == dtypes.Datetime: - return pl.Datetime() - if dtype == dtypes.Duration: - return pl.Duration() if dtype == dtypes.Date: return pl.Date() + if dtype == dtypes.Datetime or isinstance(dtype, dtypes.Datetime): + dt_time_unit = getattr(dtype, "time_unit", "us") + dt_time_zone = getattr(dtype, "time_zone", None) + return pl.Datetime(dt_time_unit, dt_time_zone) # type: ignore[arg-type] + if dtype == dtypes.Duration or isinstance(dtype, dtypes.Duration): + du_time_unit: Literal["us", "ns", "ms"] = getattr(dtype, "time_unit", "us") + return pl.Duration(time_unit=du_time_unit) + if dtype == dtypes.List: # pragma: no cover msg = "Converting to List dtype is not supported yet" return NotImplementedError(msg) diff --git a/narwhals/dtypes.py b/narwhals/dtypes.py index 2d5de0f16..730f69849 100644 --- a/narwhals/dtypes.py +++ b/narwhals/dtypes.py @@ -1,6 +1,8 @@ from __future__ import annotations +from datetime import timezone from typing import TYPE_CHECKING +from typing import Literal if TYPE_CHECKING: from typing_extensions import Self @@ -71,10 +73,95 @@ class Object(DType): ... class Unknown(DType): ... -class Datetime(TemporalType): ... - - -class Duration(TemporalType): ... +class Datetime(TemporalType): + """ + Data type representing a calendar date and time of day. + + Arguments: + time_unit: Unit of time. Defaults to `'us'` (microseconds). + time_zone: Time zone string, as defined in zoneinfo (to see valid strings run + `import zoneinfo; zoneinfo.available_timezones()` for a full list). + + Notes: + Adapted from Polars implementation at: + https://github.com/pola-rs/polars/blob/py-1.7.1/py-polars/polars/datatypes/classes.py#L398-L457 + """ + + def __init__( + self: Self, + time_unit: Literal["us", "ns", "ms", "s"] = "us", + time_zone: str | timezone | None = None, + ) -> None: + if time_unit not in {"s", "ms", "us", "ns"}: + msg = ( + "invalid `time_unit`" + f"\n\nExpected one of {{'ns','us','ms', 's'}}, got {time_unit!r}." + ) + raise ValueError(msg) + + if isinstance(time_zone, timezone): + time_zone = str(time_zone) + + self.time_unit = time_unit + self.time_zone = time_zone + + def __eq__(self: Self, other: object) -> bool: + # allow comparing object instances to class + if type(other) is type and issubclass(other, self.__class__): + return True + elif isinstance(other, self.__class__): + return self.time_unit == other.time_unit and self.time_zone == other.time_zone + else: # pragma: no cover + return False + + def __hash__(self: Self) -> int: # pragma: no cover + return hash((self.__class__, self.time_unit, self.time_zone)) + + def __repr__(self: Self) -> str: # pragma: no cover + class_name = self.__class__.__name__ + return f"{class_name}(time_unit={self.time_unit!r}, time_zone={self.time_zone!r})" + + +class Duration(TemporalType): + """ + Data type representing a time duration. + + Arguments: + time_unit: Unit of time. Defaults to `'us'` (microseconds). + + Notes: + Adapted from Polars implementation at: + https://github.com/pola-rs/polars/blob/py-1.7.1/py-polars/polars/datatypes/classes.py#L460-L502 + """ + + def __init__( + self: Self, + time_unit: Literal["us", "ns", "ms", "s"] = "us", + ) -> None: + if time_unit not in ("s", "ms", "us", "ns"): + msg = ( + "invalid `time_unit`" + f"\n\nExpected one of {{'ns','us','ms', 's'}}, got {time_unit!r}." 
+ ) + raise ValueError(msg) + + self.time_unit = time_unit + + def __eq__(self: Self, other: object) -> bool: + # allow comparing object instances to class + if type(other) is type and issubclass(other, self.__class__): + return True + elif isinstance(other, self.__class__): + return self.time_unit == other.time_unit + else: # pragma: no cover + return False + + def __hash__(self: Self) -> int: # pragma: no cover + return hash((self.__class__, self.time_unit)) + + def __repr__(self: Self) -> str: # pragma: no cover + class_name = self.__class__.__name__ + return f"{class_name}(time_unit={self.time_unit!r})" class Categorical(DType): ... diff --git a/narwhals/functions.py b/narwhals/functions.py index f0bf5d4ad..e1505e78f 100644 --- a/narwhals/functions.py +++ b/narwhals/functions.py @@ -13,6 +13,7 @@ from narwhals.dataframe import LazyFrame from narwhals.translate import from_native from narwhals.utils import Implementation +from narwhals.utils import parse_version from narwhals.utils import validate_laziness # Missing type parameters for generic type "DataFrame" @@ -235,11 +236,9 @@ def _new_series_impl( narwhals_to_native_dtype as pandas_like_narwhals_to_native_dtype, ) + backend_version = parse_version(native_namespace.__version__) dtype = pandas_like_narwhals_to_native_dtype( - dtype, - None, - implementation, - dtypes, + dtype, None, implementation, backend_version, dtypes ) native_series = native_namespace.Series(values, name=name, dtype=dtype) @@ -374,12 +373,10 @@ def _from_dict_impl( narwhals_to_native_dtype as pandas_like_narwhals_to_native_dtype, ) + backend_version = parse_version(native_namespace.__version__) schema = { name: pandas_like_narwhals_to_native_dtype( - schema[name], - native_type, - implementation, - dtypes, + schema[name], native_type, implementation, backend_version, dtypes ) for name, native_type in native_frame.dtypes.items() } diff --git a/narwhals/stable/v1/dtypes.py b/narwhals/stable/v1/dtypes.py index 942881ba4..0d1e58468 100644 --- a/narwhals/stable/v1/dtypes.py +++ b/narwhals/stable/v1/dtypes.py @@ -2,8 +2,8 @@ from narwhals.dtypes import Boolean from narwhals.dtypes import Categorical from narwhals.dtypes import Date -from narwhals.dtypes import Datetime -from narwhals.dtypes import Duration +from narwhals.dtypes import Datetime as NwDatetime +from narwhals.dtypes import Duration as NwDuration from narwhals.dtypes import Enum from narwhals.dtypes import Float32 from narwhals.dtypes import Float64 @@ -21,6 +21,41 @@ from narwhals.dtypes import UInt64 from narwhals.dtypes import Unknown + +class Datetime(NwDatetime): + """ + Data type representing a calendar date and time of day. + + Arguments: + time_unit: Unit of time. Defaults to `'us'` (microseconds). + time_zone: Time zone string, as defined in zoneinfo (to see valid strings run + `import zoneinfo; zoneinfo.available_timezones()` for a full list). + + Notes: + Adapted from Polars implementation at: + https://github.com/pola-rs/polars/blob/py-1.7.1/py-polars/polars/datatypes/classes.py#L398-L457 + """ + + def __hash__(self) -> int: + return hash(self.__class__) + + +class Duration(NwDuration): + """ + Data type representing a time duration. + + Arguments: + time_unit: Unit of time. Defaults to `'us'` (microseconds). 
+ + Notes: + Adapted from Polars implementation at: + https://github.com/pola-rs/polars/blob/py-1.7.1/py-polars/polars/datatypes/classes.py#L460-L502 + """ + + def __hash__(self) -> int: + return hash(self.__class__) + + __all__ = [ "Array", "Boolean", diff --git a/tests/dtypes_test.py b/tests/dtypes_test.py new file mode 100644 index 000000000..58061597f --- /dev/null +++ b/tests/dtypes_test.py @@ -0,0 +1,74 @@ +from __future__ import annotations + +from datetime import datetime +from datetime import timedelta +from datetime import timezone +from typing import Literal + +import numpy as np +import pandas as pd +import pyarrow as pa +import pytest + +import narwhals.stable.v1 as nw +from narwhals.utils import parse_version + + +@pytest.mark.parametrize("time_unit", ["us", "ns", "ms"]) +@pytest.mark.parametrize("time_zone", ["Europe/Rome", timezone.utc, None]) +def test_datetime_valid( + time_unit: Literal["us", "ns", "ms"], time_zone: str | timezone | None +) -> None: + dtype = nw.Datetime(time_unit=time_unit, time_zone=time_zone) + + assert dtype == nw.Datetime(time_unit=time_unit, time_zone=time_zone) + assert dtype == nw.Datetime + + if time_zone: + assert dtype != nw.Datetime(time_unit=time_unit) + if time_unit != "ms": + assert dtype != nw.Datetime(time_unit="ms") + + +@pytest.mark.parametrize("time_unit", ["abc"]) +def test_datetime_invalid(time_unit: str) -> None: + with pytest.raises(ValueError, match="invalid `time_unit`"): + nw.Datetime(time_unit=time_unit) # type: ignore[arg-type] + + +@pytest.mark.parametrize("time_unit", ["us", "ns", "ms"]) +def test_duration_valid(time_unit: Literal["us", "ns", "ms"]) -> None: + dtype = nw.Duration(time_unit=time_unit) + + assert dtype == nw.Duration(time_unit=time_unit) + assert dtype == nw.Duration + + if time_unit != "ms": + assert dtype != nw.Duration(time_unit="ms") + + +@pytest.mark.parametrize("time_unit", ["abc"]) +def test_duration_invalid(time_unit: str) -> None: + with pytest.raises(ValueError, match="invalid `time_unit`"): + nw.Duration(time_unit=time_unit) # type: ignore[arg-type] + + +def test_second_tu() -> None: + s = pd.Series(np.array([np.datetime64("2020-01-01", "s")])) + result = nw.from_native(s, series_only=True) + if parse_version(pd.__version__) < (2,): # pragma: no cover + assert result.dtype == nw.Datetime("ns") + else: + assert result.dtype == nw.Datetime("s") + s = pa.chunked_array([pa.array([datetime(2020, 1, 1)], type=pa.timestamp("s"))]) + result = nw.from_native(s, series_only=True) + assert result.dtype == nw.Datetime("s") + s = pd.Series(np.array([np.timedelta64(1, "s")])) + result = nw.from_native(s, series_only=True) + if parse_version(pd.__version__) < (2,): # pragma: no cover + assert result.dtype == nw.Duration("ns") + else: + assert result.dtype == nw.Duration("s") + s = pa.chunked_array([pa.array([timedelta(1)], type=pa.duration("s"))]) + result = nw.from_native(s, series_only=True) + assert result.dtype == nw.Duration("s") diff --git a/tests/expr_and_series/cast_test.py b/tests/expr_and_series/cast_test.py index 00f242148..dafe876ab 100644 --- a/tests/expr_and_series/cast_test.py +++ b/tests/expr_and_series/cast_test.py @@ -1,3 +1,9 @@ +from __future__ import annotations + +from datetime import datetime +from datetime import timedelta +from datetime import timezone + import pandas as pd import pyarrow as pa import pytest @@ -5,6 +11,8 @@ import narwhals.stable.v1 as nw from narwhals.utils import parse_version from tests.utils import Constructor +from tests.utils import compare_dicts +from tests.utils 
import is_windows data = { "a": [1], @@ -180,3 +188,31 @@ class Banana: with pytest.raises(AssertionError, match=r"Unknown dtype"): df.select(nw.col("a").cast(Banana)) + + +def test_cast_datetime_tz_aware( + constructor: Constructor, request: pytest.FixtureRequest +) -> None: + if "dask" in str(constructor) or ( + "pyarrow_table" in str(constructor) and is_windows() + ): + request.applymarker(pytest.mark.xfail) + + data = { + "date": [ + datetime(2024, 1, 1, tzinfo=timezone.utc) + timedelta(days=i) + for i in range(3) + ] + } + expected = { + "date": ["2024-01-01 01:00:00", "2024-01-02 01:00:00", "2024-01-03 01:00:00"] + } + + df = nw.from_native(constructor(data)) + result = df.select( + nw.col("date") + .cast(nw.Datetime("ms", time_zone="Europe/Rome")) + .cast(nw.String()) + .str.slice(offset=0, length=19) + ) + compare_dicts(result, expected) diff --git a/tests/series_only/cast_test.py b/tests/series_only/cast_test.py index 37ae55a01..672cbebc2 100644 --- a/tests/series_only/cast_test.py +++ b/tests/series_only/cast_test.py @@ -75,13 +75,13 @@ def test_cast_date_datetime_pandas() -> None: df = df.select(nw.col("a").cast(nw.Datetime)) result = nw.to_native(df) expected = pd.DataFrame({"a": [datetime(2020, 1, 1), datetime(2020, 1, 2)]}).astype( - {"a": "timestamp[ns][pyarrow]"} + {"a": "timestamp[us][pyarrow]"} ) pd.testing.assert_frame_equal(result, expected) # pandas: pyarrow datetime to date dfpd = pd.DataFrame({"a": [datetime(2020, 1, 1), datetime(2020, 1, 2)]}).astype( - {"a": "timestamp[ns][pyarrow]"} + {"a": "timestamp[us][pyarrow]"} ) df = nw.from_native(dfpd) df = df.select(nw.col("a").cast(nw.Date)) diff --git a/tests/stable_api_test.py b/tests/stable_api_test.py index a12b20cc6..7a67f5723 100644 --- a/tests/stable_api_test.py +++ b/tests/stable_api_test.py @@ -1,4 +1,5 @@ from datetime import datetime +from datetime import timedelta from typing import Any import polars as pl @@ -139,7 +140,12 @@ def test_series_docstrings() -> None: def test_dtypes(constructor: Constructor) -> None: - df = nw.from_native(constructor({"a": [1], "b": [datetime(2020, 1, 1)]})) + df = nw_v1.from_native( + constructor({"a": [1], "b": [datetime(2020, 1, 1)], "c": [timedelta(1)]}) + ) dtype = df.collect_schema()["b"] - assert dtype in {nw.Datetime} - assert isinstance(dtype, nw.Datetime) + assert dtype in {nw_v1.Datetime} + assert isinstance(dtype, nw_v1.Datetime) + dtype = df.collect_schema()["c"] + assert dtype in {nw_v1.Duration} + assert isinstance(dtype, nw_v1.Duration) diff --git a/utils/check_api_reference.py b/utils/check_api_reference.py index 410956e04..d3f30aaa2 100644 --- a/utils/check_api_reference.py +++ b/utils/check_api_reference.py @@ -31,7 +31,7 @@ "zip_with", "__iter__", } -BASE_DTYPES = {"NumericType", "DType", "TemporalType"} +BASE_DTYPES = {"NumericType", "DType", "TemporalType", "Literal"} files = {remove_suffix(i, ".py") for i in os.listdir("narwhals")} From bb98ae6c3f0bbcc55a2a54f414c7082256201a00 Mon Sep 17 00:00:00 2001 From: Vahideh Alizadeh <82591913+V-Alizade@users.noreply.github.com> Date: Mon, 30 Sep 2024 20:43:38 +0200 Subject: [PATCH 067/145] doc: add pyarrow to DataFrame docstring (#1097) * doc: add pyarrow in dataframe docstring add pa to parquet, to NumPy ndarray and shape of DataFrame * skip doctest example to avoid file creation Co-authored-by: Francesco Bruzzesi <42817048+FBruzzesi@users.noreply.github.com> --------- Co-authored-by: Francesco Bruzzesi <42817048+FBruzzesi@users.noreply.github.com> --- narwhals/dataframe.py | 21 +++++++++++++++++---- 1 file 
changed, 17 insertions(+), 4 deletions(-) diff --git a/narwhals/dataframe.py b/narwhals/dataframe.py index 04a483703..7def2d993 100644 --- a/narwhals/dataframe.py +++ b/narwhals/dataframe.py @@ -557,14 +557,16 @@ def write_parquet(self, file: str | Path | BytesIO) -> Any: Write dataframe to parquet file. Examples: - Construct pandas and Polars DataFrames: + Construct pandas, Polars and PyArrow DataFrames: >>> import pandas as pd >>> import polars as pl + >>> import pyarrow as pa >>> import narwhals as nw >>> df = {"foo": [1, 2, 3], "bar": [6.0, 7.0, 8.0], "ham": ["a", "b", "c"]} >>> df_pd = pd.DataFrame(df) >>> df_pl = pl.DataFrame(df) + >>> df_pa = pa.table(df) We define a library agnostic function: @@ -572,10 +574,11 @@ def write_parquet(self, file: str | Path | BytesIO) -> Any: ... df = nw.from_native(df) ... df.write_parquet("foo.parquet") - We can then pass either pandas or Polars to `func`: + We can then pass either pandas, Polars or PyArrow to `func`: >>> func(df_pd) # doctest:+SKIP >>> func(df_pl) # doctest:+SKIP + >>> func(df_pa) # doctest:+SKIP """ self._compliant_frame.write_parquet(file) @@ -588,10 +591,12 @@ def to_numpy(self) -> np.ndarray: >>> import pandas as pd >>> import polars as pl + >>> import pyarrow as pa >>> import narwhals as nw >>> df = {"foo": [1, 2, 3], "bar": [6.5, 7.0, 8.5], "ham": ["a", "b", "c"]} >>> df_pd = pd.DataFrame(df) >>> df_pl = pl.DataFrame(df) + >>> df_pa = pa.table(df) We define a library agnostic function: @@ -599,7 +604,7 @@ def to_numpy(self) -> np.ndarray: ... def func(df): ... return df.to_numpy() - We can then pass either pandas or Polars to `func`: + We can then pass either pandas, Polars or PyArrow to `func`: >>> func(df_pd) array([[1, 6.5, 'a'], @@ -609,6 +614,10 @@ def to_numpy(self) -> np.ndarray: array([[1, 6.5, 'a'], [2, 7.0, 'b'], [3, 8.5, 'c']], dtype=object) + >>> func(df_pa) + array([[1, 6.5, 'a'], + [2, 7.0, 'b'], + [3, 8.5, 'c']], dtype=object) """ return self._compliant_frame.to_numpy() @@ -622,10 +631,12 @@ def shape(self) -> tuple[int, int]: >>> import pandas as pd >>> import polars as pl + >>> import pyarrow as pa >>> import narwhals as nw >>> df = {"foo": [1, 2, 3, 4, 5]} >>> df_pd = pd.DataFrame(df) >>> df_pl = pl.DataFrame(df) + >>> df_pa = pa.table(df) We define a library agnostic function: @@ -633,12 +644,14 @@ def shape(self) -> tuple[int, int]: ... def func(df): ... 
return df.shape - We can then pass either pandas or Polars to `func`: + We can then pass either pandas, Polars or PyArrow to `func`: >>> func(df_pd) (5, 1) >>> func(df_pl) (5, 1) + >>> func(df_pa) + (5, 1) """ return self._compliant_frame.shape # type: ignore[no-any-return] From 9094d5cea11e87cc5768c83bbd86aaa1eb069579 Mon Sep 17 00:00:00 2001 From: Francesco Bruzzesi <42817048+FBruzzesi@users.noreply.github.com> Date: Mon, 30 Sep 2024 20:52:39 +0200 Subject: [PATCH 068/145] fix: zip with scalar (#1099) --- narwhals/_dask/namespace.py | 2 +- narwhals/_pandas_like/series.py | 4 ++++ tests/expr_and_series/when_test.py | 14 ++------------ tests/series_only/zip_with_test.py | 10 ++++++++++ 4 files changed, 17 insertions(+), 13 deletions(-) diff --git a/narwhals/_dask/namespace.py b/narwhals/_dask/namespace.py index 01d5bea48..82fc29490 100644 --- a/narwhals/_dask/namespace.py +++ b/narwhals/_dask/namespace.py @@ -326,7 +326,7 @@ def __call__(self, df: DaskLazyFrame) -> list[dask_expr.Series]: # `self._otherwise_value` is a scalar and can't be converted to an expression return [value_series.where(condition, self._otherwise_value)] validate_comparand(condition, otherwise_series) - return [value_series.zip_with(condition, otherwise_series)] + return [value_series.where(condition, otherwise_series)] def then(self, value: DaskExpr | Any) -> DaskThen: self._then_value = value diff --git a/narwhals/_pandas_like/series.py b/narwhals/_pandas_like/series.py index 6569f8b5d..f4bcaec07 100644 --- a/narwhals/_pandas_like/series.py +++ b/narwhals/_pandas_like/series.py @@ -620,6 +620,10 @@ def quantile( def zip_with(self: Self, mask: Any, other: Any) -> PandasLikeSeries: ser = self._native_series mask = validate_column_comparand(ser.index, mask) + if isinstance(mask, str) or not isinstance( + mask, (self.__native_namespace__().Series, Sequence) + ): + mask = [mask] other = validate_column_comparand(ser.index, other) res = ser.where(mask, other) return self._from_native_series(res) diff --git a/tests/expr_and_series/when_test.py b/tests/expr_and_series/when_test.py index bcd796a4a..993988744 100644 --- a/tests/expr_and_series/when_test.py +++ b/tests/expr_and_series/when_test.py @@ -122,12 +122,7 @@ def test_otherwise_series(constructor_eager: Any) -> None: compare_dicts(result, expected) -def test_otherwise_expression( - request: pytest.FixtureRequest, constructor: Constructor -) -> None: - if "dask" in str(constructor): - request.applymarker(pytest.mark.xfail) - +def test_otherwise_expression(constructor: Constructor) -> None: df = nw.from_native(constructor(data)) result = df.select( nw.when(nw.col("a") == 1).then(-1).otherwise(nw.col("a") + 7).alias("a_when") @@ -138,12 +133,7 @@ def test_otherwise_expression( compare_dicts(result, expected) -def test_when_then_otherwise_into_expr( - request: pytest.FixtureRequest, constructor: Constructor -) -> None: - if "dask" in str(constructor): - request.applymarker(pytest.mark.xfail) - +def test_when_then_otherwise_into_expr(constructor: Constructor) -> None: df = nw.from_native(constructor(data)) result = df.select(nw.when(nw.col("a") > 1).then("c").otherwise("e")) expected = {"c": [7, 5, 6]} diff --git a/tests/series_only/zip_with_test.py b/tests/series_only/zip_with_test.py index 0c068c386..5d1461da3 100644 --- a/tests/series_only/zip_with_test.py +++ b/tests/series_only/zip_with_test.py @@ -16,3 +16,13 @@ def test_zip_with(constructor_eager: Any) -> None: result = series1.zip_with(mask, series2) expected = [1, 4, 2] compare_dicts({"a": result}, {"a": 
expected}) + + +def test_zip_with_length_1(constructor_eager: Any) -> None: + series1 = nw.from_native(constructor_eager({"a": [1]}), eager_only=True)["a"] + series2 = nw.from_native(constructor_eager({"a": [4]}), eager_only=True)["a"] + mask = nw.from_native(constructor_eager({"a": [False]}), eager_only=True)["a"] + + result = series1.zip_with(mask, series2) + expected = [4] + compare_dicts({"a": result}, {"a": expected}) From 0ffd2bcbe3c5c95e17a11bea46d69f788a020093 Mon Sep 17 00:00:00 2001 From: raisadz <34237447+raisadz@users.noreply.github.com> Date: Mon, 30 Sep 2024 19:54:34 +0100 Subject: [PATCH 069/145] feat: add `maybe_reset_index` for pandas-like dataframe or series (#1095) --------- Co-authored-by: Francesco Bruzzesi <42817048+FBruzzesi@users.noreply.github.com> Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com> Co-authored-by: Marco Edward Gorelli --- docs/api-reference/narwhals.md | 1 + narwhals/__init__.py | 2 ++ narwhals/stable/v1/__init__.py | 31 +++++++++++++++++++++++++ narwhals/utils.py | 41 ++++++++++++++++++++++++++++++++++ tests/utils_test.py | 24 ++++++++++++++++++++ 5 files changed, 99 insertions(+) diff --git a/docs/api-reference/narwhals.md b/docs/api-reference/narwhals.md index d678b8732..2700c48c7 100644 --- a/docs/api-reference/narwhals.md +++ b/docs/api-reference/narwhals.md @@ -22,6 +22,7 @@ Here are the top-level functions available in Narwhals. - maybe_align_index - maybe_convert_dtypes - maybe_get_index + - maybe_reset_index - maybe_set_index - mean - mean_horizontal diff --git a/narwhals/__init__.py b/narwhals/__init__.py index 2b571f0e2..e5ee71a18 100644 --- a/narwhals/__init__.py +++ b/narwhals/__init__.py @@ -55,6 +55,7 @@ from narwhals.utils import maybe_align_index from narwhals.utils import maybe_convert_dtypes from narwhals.utils import maybe_get_index +from narwhals.utils import maybe_reset_index from narwhals.utils import maybe_set_index __version__ = "1.8.4" @@ -72,6 +73,7 @@ "maybe_align_index", "maybe_convert_dtypes", "maybe_get_index", + "maybe_reset_index", "maybe_set_index", "get_native_namespace", "all", diff --git a/narwhals/stable/v1/__init__.py b/narwhals/stable/v1/__init__.py index b542b90fa..b0cefc3e6 100644 --- a/narwhals/stable/v1/__init__.py +++ b/narwhals/stable/v1/__init__.py @@ -55,6 +55,7 @@ from narwhals.utils import maybe_align_index as nw_maybe_align_index from narwhals.utils import maybe_convert_dtypes as nw_maybe_convert_dtypes from narwhals.utils import maybe_get_index as nw_maybe_get_index +from narwhals.utils import maybe_reset_index as nw_maybe_reset_index from narwhals.utils import maybe_set_index as nw_maybe_set_index if TYPE_CHECKING: @@ -1802,6 +1803,35 @@ def maybe_set_index(df: T, column_names: str | list[str]) -> T: return nw_maybe_set_index(df, column_names) +def maybe_reset_index(obj: T) -> T: + """ + Reset the index to the default integer index of a DataFrame or a Series, if it's pandas-like. + + Notes: + This is only really intended for backwards-compatibility purposes, + for example if your library already resets the index for users. + If you're designing a new library, we highly encourage you to not + rely on the Index. + For non-pandas-like inputs, this is a no-op. 
+ + Examples: + >>> import pandas as pd + >>> import polars as pl + >>> import narwhals.stable.v1 as nw + >>> df_pd = pd.DataFrame({"a": [1, 2], "b": [4, 5]}, index=([6, 7])) + >>> df = nw.from_native(df_pd) + >>> nw.to_native(nw.maybe_reset_index(df)) + a b + 0 1 4 + 1 2 5 + >>> series_pd = pd.Series([1, 2]) + >>> series = nw.from_native(series_pd, series_only=True) + >>> nw.maybe_get_index(series) + RangeIndex(start=0, stop=2, step=1) + """ + return nw_maybe_reset_index(obj) + + def get_native_namespace(obj: Any) -> Any: """ Get native namespace from object. @@ -2032,6 +2062,7 @@ def from_dict( "maybe_align_index", "maybe_convert_dtypes", "maybe_get_index", + "maybe_reset_index", "maybe_set_index", "get_native_namespace", "get_level", diff --git a/narwhals/utils.py b/narwhals/utils.py index 62ae7730b..37cce17d3 100644 --- a/narwhals/utils.py +++ b/narwhals/utils.py @@ -306,6 +306,47 @@ def maybe_set_index(df: T, column_names: str | list[str]) -> T: return df_any # type: ignore[no-any-return] +def maybe_reset_index(obj: T) -> T: + """ + Reset the index to the default integer index of a DataFrame or a Series, if it's pandas-like. + + Notes: + This is only really intended for backwards-compatibility purposes, + for example if your library already resets the index for users. + If you're designing a new library, we highly encourage you to not + rely on the Index. + For non-pandas-like inputs, this is a no-op. + + Examples: + >>> import pandas as pd + >>> import polars as pl + >>> import narwhals as nw + >>> df_pd = pd.DataFrame({"a": [1, 2], "b": [4, 5]}, index=([6, 7])) + >>> df = nw.from_native(df_pd) + >>> nw.to_native(nw.maybe_reset_index(df)) + a b + 0 1 4 + 1 2 5 + >>> series_pd = pd.Series([1, 2]) + >>> series = nw.from_native(series_pd, series_only=True) + >>> nw.maybe_get_index(series) + RangeIndex(start=0, stop=2, step=1) + """ + obj_any = cast(Any, obj) + native_obj = to_native(obj_any) + if is_pandas_like_dataframe(native_obj): + return obj_any._from_compliant_dataframe( # type: ignore[no-any-return] + obj_any._compliant_frame._from_native_frame(native_obj.reset_index(drop=True)) + ) + if is_pandas_like_series(native_obj): + return obj_any._from_compliant_series( # type: ignore[no-any-return] + obj_any._compliant_series._from_native_series( + native_obj.reset_index(drop=True) + ) + ) + return obj_any # type: ignore[no-any-return] + + def maybe_convert_dtypes(obj: T, *args: bool, **kwargs: bool | str) -> T: """ Convert columns or series to the best possible dtypes using dtypes supporting ``pd.NA``, if df is pandas-like. 
diff --git a/tests/utils_test.py b/tests/utils_test.py index f51c28eab..cea458bc9 100644 --- a/tests/utils_test.py +++ b/tests/utils_test.py @@ -84,6 +84,30 @@ def test_maybe_get_index_polars() -> None: assert result is None +def test_maybe_reset_index_pandas() -> None: + pandas_df = nw.from_native( + pd.DataFrame({"a": [1, 2, 3], "b": [4, 5, 6]}, index=[7, 8, 9]) + ) + result = nw.maybe_reset_index(pandas_df) + expected = pd.DataFrame({"a": [1, 2, 3], "b": [4, 5, 6]}, index=[0, 1, 2]) + assert_frame_equal(nw.to_native(result), expected) + pandas_series = nw.from_native( + pd.Series([1, 2, 3], index=[7, 8, 9]), series_only=True + ) + result_s = nw.maybe_reset_index(pandas_series) + expected_s = pd.Series([1, 2, 3], index=[0, 1, 2]) + assert_series_equal(nw.to_native(result_s), expected_s) + + +def test_maybe_reset_index_polars() -> None: + df = nw.from_native(pl.DataFrame({"a": [1, 2, 3], "b": [4, 5, 6]})) + result = nw.maybe_reset_index(df) + assert result is df + series = nw.from_native(pl.Series([1, 2, 3]), series_only=True) + result_s = nw.maybe_reset_index(series) + assert result_s is series + + @pytest.mark.skipif( parse_version(pd.__version__) < parse_version("1.0.0"), reason="too old for convert_dtypes", From 7fb0c5da896b0ef05c14ad562d746efb38e7ac5a Mon Sep 17 00:00:00 2001 From: Liam Connors Date: Mon, 30 Sep 2024 16:05:01 -0400 Subject: [PATCH 070/145] test: allow cuDF series in compare_dicts (#1100) * allow cuDF series in compare_dicts * add pragma: no cover for cuDF --- tests/utils.py | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/tests/utils.py b/tests/utils.py index e8b453f9d..b2030744c 100644 --- a/tests/utils.py +++ b/tests/utils.py @@ -34,7 +34,12 @@ def compare_dicts(result: Any, expected: dict[str, Any]) -> None: for key in result.columns: assert key in expected for key in expected: - for lhs, rhs in zip_strict(result[key], expected[key]): + result_key = result[key] + if hasattr(result_key, "_compliant_series") and "CUDF" in str( + result_key._compliant_series._implementation + ): # pragma: no cover + result_key = result_key.to_pandas() + for lhs, rhs in zip_strict(result_key, expected[key]): if hasattr(lhs, "as_py"): lhs = lhs.as_py() # noqa: PLW2901 if hasattr(rhs, "as_py"): # pragma: no cover From 9fb12be0c6d503170c96e766c44cba2d36a63de6 Mon Sep 17 00:00:00 2001 From: Marco Edward Gorelli Date: Tue, 1 Oct 2024 08:39:30 +0100 Subject: [PATCH 071/145] feat: allow inspecting the inner type of List (#1104) --- narwhals/_arrow/utils.py | 2 +- narwhals/_duckdb/dataframe.py | 4 ++-- narwhals/_ibis/dataframe.py | 4 +++- narwhals/_pandas_like/utils.py | 7 ++++++- narwhals/_polars/utils.py | 2 +- narwhals/dtypes.py | 26 +++++++++++++++++++++++++- tests/dtypes_test.py | 16 +++++++++++++++- 7 files changed, 53 insertions(+), 8 deletions(-) diff --git a/narwhals/_arrow/utils.py b/narwhals/_arrow/utils.py index e34e949d5..bb5f3f7ff 100644 --- a/narwhals/_arrow/utils.py +++ b/narwhals/_arrow/utils.py @@ -58,7 +58,7 @@ def native_to_narwhals_dtype(dtype: Any, dtypes: DTypes) -> DType: if pa.types.is_struct(dtype): return dtypes.Struct() if pa.types.is_list(dtype) or pa.types.is_large_list(dtype): - return dtypes.List() + return dtypes.List(native_to_narwhals_dtype(dtype.value_type, dtypes)) if pa.types.is_fixed_size_list(dtype): return dtypes.Array() return dtypes.Unknown() # pragma: no cover diff --git a/narwhals/_duckdb/dataframe.py b/narwhals/_duckdb/dataframe.py index 5e8cb73d3..95567283f 100644 --- a/narwhals/_duckdb/dataframe.py +++ 
b/narwhals/_duckdb/dataframe.py @@ -50,8 +50,8 @@ def map_duckdb_dtype_to_narwhals_dtype(duckdb_dtype: Any, dtypes: DTypes) -> DTy return dtypes.Duration() if duckdb_dtype.startswith("STRUCT"): return dtypes.Struct() - if re.match(r"\w+\[\]", duckdb_dtype): - return dtypes.List() + if match_ := re.match(r"(.*)\[\]$", duckdb_dtype): + return dtypes.List(map_duckdb_dtype_to_narwhals_dtype(match_.group(1), dtypes)) if re.match(r"\w+\[\d+\]", duckdb_dtype): return dtypes.Array() return dtypes.Unknown() diff --git a/narwhals/_ibis/dataframe.py b/narwhals/_ibis/dataframe.py index 6f53e277d..8ee01e78b 100644 --- a/narwhals/_ibis/dataframe.py +++ b/narwhals/_ibis/dataframe.py @@ -43,7 +43,9 @@ def map_ibis_dtype_to_narwhals_dtype(ibis_dtype: Any, dtypes: DTypes) -> DType: if ibis_dtype.is_timestamp(): return dtypes.Datetime() if ibis_dtype.is_array(): - return dtypes.List() + return dtypes.List( + map_ibis_dtype_to_narwhals_dtype(ibis_dtype.value_type, dtypes) + ) if ibis_dtype.is_struct(): return dtypes.Struct() return dtypes.Unknown() # pragma: no cover diff --git a/narwhals/_pandas_like/utils.py b/narwhals/_pandas_like/utils.py index 92fbd5193..910c19a11 100644 --- a/narwhals/_pandas_like/utils.py +++ b/narwhals/_pandas_like/utils.py @@ -7,6 +7,9 @@ from typing import Literal from typing import TypeVar +from narwhals._arrow.utils import ( + native_to_narwhals_dtype as arrow_native_to_narwhals_dtype, +) from narwhals.utils import Implementation from narwhals.utils import isinstance_or_issubclass @@ -276,7 +279,9 @@ def native_to_narwhals_dtype(column: Any, dtypes: DTypes) -> DType: if dtype == "date32[day][pyarrow]": return dtypes.Date() if dtype.startswith(("large_list", "list")): - return dtypes.List() + return dtypes.List( + arrow_native_to_narwhals_dtype(column.dtype.pyarrow_dtype.value_type, dtypes) + ) if dtype.startswith("fixed_size_list"): return dtypes.Array() if dtype.startswith("struct"): diff --git a/narwhals/_polars/utils.py b/narwhals/_polars/utils.py index b2f060906..95b98b17a 100644 --- a/narwhals/_polars/utils.py +++ b/narwhals/_polars/utils.py @@ -75,7 +75,7 @@ def native_to_narwhals_dtype(dtype: Any, dtypes: DTypes) -> DType: if dtype == pl.Struct: return dtypes.Struct() if dtype == pl.List: - return dtypes.List() + return dtypes.List(native_to_narwhals_dtype(dtype.inner, dtypes)) if dtype == pl.Array: return dtypes.Array() return dtypes.Unknown() diff --git a/narwhals/dtypes.py b/narwhals/dtypes.py index 730f69849..6acfbb764 100644 --- a/narwhals/dtypes.py +++ b/narwhals/dtypes.py @@ -173,7 +173,31 @@ class Enum(DType): ... class Struct(DType): ... -class List(DType): ... +class List(DType): + def __init__(self, inner: DType | type[DType]) -> None: + self.inner = inner + + def __eq__(self, other: DType | type[DType]) -> bool: # type: ignore[override] + # This equality check allows comparison of type classes and type instances. + # If a parent type is not specific about its inner type, we infer it as equal: + # > list[i64] == list[i64] -> True + # > list[i64] == list[f32] -> False + # > list[i64] == list -> True + + # allow comparing object instances to class + if type(other) is type and issubclass(other, self.__class__): + return True + elif isinstance(other, self.__class__): + return self.inner == other.inner + else: + return False + + def __hash__(self) -> int: + return hash((self.__class__, self.inner)) + + def __repr__(self) -> str: + class_name = self.__class__.__name__ + return f"{class_name}({self.inner!r})" class Array(DType): ... 
diff --git a/tests/dtypes_test.py index 58061597f..c64dfed51 100644 --- a/tests/dtypes_test.py +++ b/tests/dtypes_test.py @@ -53,7 +53,21 @@ def test_duration_invalid(time_unit: str) -> None: nw.Duration(time_unit=time_unit) # type: ignore[arg-type] -def test_second_tu() -> None: +def test_list_valid() -> None: + dtype = nw.List(nw.Int64) + assert dtype == nw.List(nw.Int64) + assert dtype == nw.List + assert dtype != nw.List(nw.Float32) + assert dtype != nw.Duration + assert repr(dtype) == "List(<class 'narwhals.dtypes.Int64'>)" + dtype = nw.List(nw.List(nw.Int64)) + assert dtype == nw.List(nw.List(nw.Int64)) + assert dtype == nw.List + assert dtype != nw.List(nw.List(nw.Float32)) + assert dtype in {nw.List(nw.List(nw.Int64))} + + +def test_second_time_unit() -> None: s = pd.Series(np.array([np.datetime64("2020-01-01", "s")])) result = nw.from_native(s, series_only=True) if parse_version(pd.__version__) < (2,): # pragma: no cover From 18a5dce7b1f7f50a6e5b52570f615f22f1c02633 Mon Sep 17 00:00:00 2001 From: Liam Connors Date: Tue, 1 Oct 2024 03:41:40 -0400 Subject: [PATCH 072/145] update check for cuDF series (#1105) --- tests/utils.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/tests/utils.py b/tests/utils.py index b2030744c..15ce25140 100644 --- a/tests/utils.py +++ b/tests/utils.py @@ -11,6 +11,7 @@ import pandas as pd from narwhals.typing import IntoFrame +from narwhals.utils import Implementation if sys.version_info >= (3, 10): from typing import TypeAlias # pragma: no cover @@ -35,8 +36,9 @@ def compare_dicts(result: Any, expected: dict[str, Any]) -> None: assert key in expected for key in expected: result_key = result[key] - if hasattr(result_key, "_compliant_series") and "CUDF" in str( - result_key._compliant_series._implementation + if ( + hasattr(result_key, "_compliant_series") + and result_key._compliant_series._implementation is Implementation.CUDF ): # pragma: no cover result_key = result_key.to_pandas() for lhs, rhs in zip_strict(result_key, expected[key]): From 03ebe770fa56f45832eb98cbc8527c03ece2fd52 Mon Sep 17 00:00:00 2001 From: Liam Connors Date: Tue, 1 Oct 2024 03:42:43 -0400 Subject: [PATCH 073/145] test: xfail __iter___test for cuDF (#1106) * xfail __iter___test for cuDF * rename test --------- Co-authored-by: Marco Edward Gorelli --- tests/series_only/__iter___test.py | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/tests/series_only/__iter___test.py b/tests/series_only/__iter___test.py index d190cd80a..a0a5c1189 100644 --- a/tests/series_only/__iter___test.py +++ b/tests/series_only/__iter___test.py @@ -3,13 +3,17 @@ from collections.abc import Iterable from typing import Any +import pytest + import narwhals.stable.v1 as nw from tests.utils import compare_dicts data = [1, 2, 3] -def test_to_list(constructor_eager: Any) -> None: +def test_iter(constructor_eager: Any, request: pytest.FixtureRequest) -> None: + if "cudf" in str(constructor_eager): + request.applymarker(pytest.mark.xfail) s = nw.from_native(constructor_eager({"a": data}), eager_only=True)["a"] assert isinstance(s, Iterable) From 8a1241fe4a05117f67aea5e0a290604e79583953 Mon Sep 17 00:00:00 2001 From: Marco Edward Gorelli Date: Tue, 1 Oct 2024 08:43:34 +0100 Subject: [PATCH 074/145] chore: Refactor validate_column_comparand (#1102) * simplify * rename argument --- narwhals/_pandas_like/series.py | 8 +++----- narwhals/_pandas_like/utils.py | 6 ++++-- 2 files changed, 7 insertions(+), 7 deletions(-) diff --git a/narwhals/_pandas_like/series.py
b/narwhals/_pandas_like/series.py index f4bcaec07..74e1c492d 100644 --- a/narwhals/_pandas_like/series.py +++ b/narwhals/_pandas_like/series.py @@ -619,11 +619,9 @@ def quantile( def zip_with(self: Self, mask: Any, other: Any) -> PandasLikeSeries: ser = self._native_series - mask = validate_column_comparand(ser.index, mask) - if isinstance(mask, str) or not isinstance( - mask, (self.__native_namespace__().Series, Sequence) - ): - mask = [mask] + mask = validate_column_comparand( + ser.index, mask, treat_length_one_as_scalar=False + ) other = validate_column_comparand(ser.index, other) res = ser.where(mask, other) return self._from_native_series(res) diff --git a/narwhals/_pandas_like/utils.py b/narwhals/_pandas_like/utils.py index 910c19a11..df87e6499 100644 --- a/narwhals/_pandas_like/utils.py +++ b/narwhals/_pandas_like/utils.py @@ -32,7 +32,9 @@ } -def validate_column_comparand(index: Any, other: Any) -> Any: +def validate_column_comparand( + index: Any, other: Any, *, treat_length_one_as_scalar: bool = True +) -> Any: """Validate RHS of binary operation. If the comparison isn't supported, return `NotImplemented` so that the @@ -53,7 +55,7 @@ def validate_column_comparand(index: Any, other: Any) -> Any: if isinstance(other, PandasLikeDataFrame): return NotImplemented if isinstance(other, PandasLikeSeries): - if other.len() == 1: + if other.len() == 1 and treat_length_one_as_scalar: # broadcast return other.item() if other._native_series.index is not index: From 9cd953eaecd571fb99b509fe7b6a3660f5862710 Mon Sep 17 00:00:00 2001 From: Zhengbo Wang Date: Tue, 1 Oct 2024 16:19:59 +0800 Subject: [PATCH 075/145] fix: Specialize typing in `dataframe` between `Frame` and `DataFrame` (#1030) --- narwhals/dataframe.py | 6 ++++-- narwhals/typing.py | 4 ++-- 2 files changed, 6 insertions(+), 4 deletions(-) diff --git a/narwhals/dataframe.py b/narwhals/dataframe.py index 7def2d993..0ee778ab2 100644 --- a/narwhals/dataframe.py +++ b/narwhals/dataframe.py @@ -32,10 +32,12 @@ from narwhals.group_by import GroupBy from narwhals.group_by import LazyGroupBy from narwhals.series import Series + from narwhals.typing import IntoDataFrame from narwhals.typing import IntoExpr from narwhals.typing import IntoFrame FrameT = TypeVar("FrameT", bound="IntoFrame") +DataFrameT = TypeVar("DataFrameT", bound="IntoDataFrame") class BaseFrame(Generic[FrameT]): @@ -302,7 +304,7 @@ def unpivot( ) -class DataFrame(BaseFrame[FrameT]): +class DataFrame(BaseFrame[DataFrameT]): """ Narwhals DataFrame, backed by a native dataframe. @@ -424,7 +426,7 @@ def lazy(self) -> LazyFrame[Any]: """ return self._lazyframe(self._compliant_frame.lazy(), level=self._level) - def to_native(self) -> FrameT: + def to_native(self) -> DataFrameT: """ Convert Narwhals DataFrame to native one. diff --git a/narwhals/typing.py b/narwhals/typing.py index 62a7ca58c..30de0a097 100644 --- a/narwhals/typing.py +++ b/narwhals/typing.py @@ -45,12 +45,12 @@ def __dataframe__(self, *args: Any, **kwargs: Any) -> Any: ... 
"""Anything which can be converted to a Narwhals DataFrame or LazyFrame.""" Frame: TypeAlias = Union["DataFrame[Any]", "LazyFrame[Any]"] -"""DataFrame or LazyFrame""" +"""Narwhals DataFrame or Narwhals LazyFrame""" # TypeVars for some of the above IntoFrameT = TypeVar("IntoFrameT", bound="IntoFrame") IntoDataFrameT = TypeVar("IntoDataFrameT", bound="IntoDataFrame") -FrameT = TypeVar("FrameT", "DataFrame[Any]", "LazyFrame[Any]") +FrameT = TypeVar("FrameT", bound="Frame") DataFrameT = TypeVar("DataFrameT", bound="DataFrame[Any]") From 9b4e5ab6aed16b7106a3ce78f3f03cf721203c31 Mon Sep 17 00:00:00 2001 From: Marco Edward Gorelli Date: Tue, 1 Oct 2024 10:02:33 +0100 Subject: [PATCH 076/145] update docs (#1109) --- README.md | 6 ++- docs/css/mkdocstrings.css | 9 ----- docs/extending.md | 81 ++++++++++++++++++++------------------- docs/index.md | 10 +++-- mkdocs.yml | 11 +++--- 5 files changed, 59 insertions(+), 58 deletions(-) delete mode 100644 docs/css/mkdocstrings.css diff --git a/README.md b/README.md index 29623920a..847943486 100644 --- a/README.md +++ b/README.md @@ -13,11 +13,12 @@ Extremely lightweight and extensible compatibility layer between dataframe libraries! - **Full API support**: cuDF, Modin, pandas, Polars, PyArrow +- **Lazy-only support**: Dask - **Interchange-level support**: Ibis, Vaex, anything else which implements the DataFrame Interchange Protocol Seamlessly support all, without depending on any! -- ✅ **Just use** a subset of **the Polars API**, no need to learn anything new +- ✅ **Just use** [a subset of **the Polars API**](https://narwhals-dev.github.io/narwhals/api-reference/), no need to learn anything new - ✅ **Zero dependencies**, Narwhals only uses what the user passes in so your library can stay lightweight - ✅ Separate **lazy** and eager APIs, use **expressions** @@ -117,6 +118,9 @@ Narwhals has been featured in several talks, podcasts, and blog posts: - [Talk Python to me Podcast](https://youtu.be/FSH7BZ0tuE0) Ahoy, Narwhals are bridging the data science APIs +- [Python Bytes Podcast](https://www.youtube.com/live/N7w_ESVW40I?si=y-wN1uCsAuJOKlOT&t=382) + Ahoy, Narwhals are bridging the data science APIs + - [Super Data Science: ML & AI Podcast](https://www.youtube.com/watch?v=TeG4U8R0U8U) Narwhals: For Pandas-to-Polars DataFrame Compatibility diff --git a/docs/css/mkdocstrings.css b/docs/css/mkdocstrings.css deleted file mode 100644 index 0951698ad..000000000 --- a/docs/css/mkdocstrings.css +++ /dev/null @@ -1,9 +0,0 @@ -.md-header__topic { - font-size: 200%; - font-family: Verdana, Geneva, Tahoma, sans-serif; -} - -.md-header__button.md-logo img { - height: 22%; - width: 22%; - } \ No newline at end of file diff --git a/docs/extending.md b/docs/extending.md index f6829ba3f..22d85f701 100644 --- a/docs/extending.md +++ b/docs/extending.md @@ -2,7 +2,7 @@ ## List of supported libraries (and how to add yours!) -Currently, Narwhals supports the following libraries as inputs: +Currently, Narwhals has full API support for the following libraries: | Library | 🔗 Link 🔗 | | ------------- | ------------- | @@ -12,46 +12,13 @@ Currently, Narwhals supports the following libraries as inputs: | Modin | [github.com/modin-project/modin](https://github.com/modin-project/modin) | | PyArrow ⇶ | [arrow.apache.org/docs/python](https://arrow.apache.org/docs/python/index.html) | -If you want your own library to be recognised too, you're welcome open a PR (with tests)! 
-Alternatively, if you can't do that (for example, if you library is closed-source), see -the next section for what else you can do. - -To check which methods are supported for which backend in depth, please refer to the -[API completeness page](api-completeness/index.md). - -## Extending Narwhals - -We love open source, but we're not "open source absolutists". If you're unable to open -source you library, then this is how you can make your library compatible with Narwhals. - -Make sure that, in addition to the public Narwhals API, you also define: - - - `DataFrame.__narwhals_dataframe__`: return an object which implements public methods - from `Narwhals.DataFrame` - - `DataFrame.__narwhals_namespace__`: return an object which implements public top-level - functions from `narwhals` (e.g. `narwhals.col`, `narwhals.concat`, ...) - - `DataFrame.__native_namespace__`: return a native namespace object which must have a - `from_dict` method - - `LazyFrame.__narwhals_lazyframe__`: return an object which implements public methods - from `Narwhals.LazyFrame` - - `LazyFrame.__narwhals_namespace__`: return an object which implements public top-level - functions from `narwhals` (e.g. `narwhals.col`, `narwhals.concat`, ...) - - `LazyFrame.__native_namespace__`: return a native namespace object which must have a - `from_dict` method - - `Series.__narwhals_series__`: return an object which implements public methods - from `Narwhals.Series` - - If your library doesn't distinguish between lazy and eager, then it's OK for your dataframe - object to implement both `__narwhals_dataframe__` and `__narwhals_lazyframe__`. In fact, - that's currently what `narwhals._pandas_like.dataframe.PandasLikeDataFrame` does. So, if you're stuck, - take a look at the source code to see how it's done! - -Note that the "extension" mechanism is still experimental. If anything is not clear, or -doesn't work, please do raise an issue or contact us on Discord (see the link on the README). +It also has lazy-only support for [Dask](https://github.com/dask/dask), and interchange-only support +for [DuckDB](https://github.com/duckdb/duckdb) and [Ibis](https://github.com/ibis-project/ibis). -## Levels +### Levels -Narwhals comes with two levels of support: "full" and "interchange". +Narwhals comes with two levels of support ("full" and "interchange"), and we are working on defining +a "lazy-only" level too. Libraries for which we have full support can benefit from the whole [Narwhals API](https://narwhals-dev.github.io/narwhals/api-reference/). @@ -91,4 +58,38 @@ def func(df: Any) -> Schema: return df.schema ``` is also supported, meaning that, in addition to the libraries mentioned above, you can -also pass Ibis, Vaex, PyArrow, and any other library which implements the protocol. +also pass Ibis, DuckDB, Vaex, and any library which implements the protocol. + +### Extending Narwhals + +If you want your own library to be recognised too, you're welcome open a PR (with tests)!. +Alternatively, if you can't do that (for example, if you library is closed-source), see +the next section for what else you can do. + +We love open source, but we're not "open source absolutists". If you're unable to open +source you library, then this is how you can make your library compatible with Narwhals. 
+ +Make sure that, in addition to the public Narwhals API, you also define: + + - `DataFrame.__narwhals_dataframe__`: return an object which implements public methods + from `Narwhals.DataFrame` + - `DataFrame.__narwhals_namespace__`: return an object which implements public top-level + functions from `narwhals` (e.g. `narwhals.col`, `narwhals.concat`, ...) + - `DataFrame.__native_namespace__`: return a native namespace object which must have a + `from_dict` method + - `LazyFrame.__narwhals_lazyframe__`: return an object which implements public methods + from `Narwhals.LazyFrame` + - `LazyFrame.__narwhals_namespace__`: return an object which implements public top-level + functions from `narwhals` (e.g. `narwhals.col`, `narwhals.concat`, ...) + - `LazyFrame.__native_namespace__`: return a native namespace object which must have a + `from_dict` method + - `Series.__narwhals_series__`: return an object which implements public methods + from `Narwhals.Series` + + If your library doesn't distinguish between lazy and eager, then it's OK for your dataframe + object to implement both `__narwhals_dataframe__` and `__narwhals_lazyframe__`. In fact, + that's currently what `narwhals._pandas_like.dataframe.PandasLikeDataFrame` does. So, if you're stuck, + take a look at the source code to see how it's done! + +Note that this "extension" mechanism is still experimental. If anything is not clear, or +doesn't work, please do raise an issue or contact us on Discord (see the link on the README). diff --git a/docs/index.md b/docs/index.md index 1269f70d8..f18d9af85 100644 --- a/docs/index.md +++ b/docs/index.md @@ -2,11 +2,15 @@ ![](assets/image.png) -Extremely lightweight compatibility layer between Polars, pandas, and more. +Extremely lightweight and extensible compatibility layer between dataframe libraries! -Seamlessly support both, without depending on either! +- **Full API support**: cuDF, Modin, pandas, Polars, PyArrow +- **Lazy-only support**: Dask +- **Interchange-level support**: Ibis, Vaex, anything else which implements the DataFrame Interchange Protocol -- ✅ **Just use** a subset of **the Polars API**, no need to learn anything new +Seamlessly support all, without depending on any! 
+ +- ✅ **Just use** [a subset of **the Polars API**](https://narwhals-dev.github.io/narwhals/api-reference/), no need to learn anything new - ✅ **Zero dependencies**, Narwhals only uses what the user passes in so your library can stay lightweight - ✅ Separate **lazy** and eager APIs, use **expressions** diff --git a/mkdocs.yml b/mkdocs.yml index ccd4307ae..10c3741ed 100644 --- a/mkdocs.yml +++ b/mkdocs.yml @@ -19,11 +19,12 @@ nav: - extending.md - how_it_works.md - Roadmap and related projects: roadmap_and_related.md - - API Completeness: - - api-completeness/index.md - - Supported DataFrame methods: api-completeness/dataframe.md - - Supporteda Expr methods: api-completeness/expr.md - - Supported Series methods: api-completeness/series.md + # Commented-out until https://github.com/narwhals-dev/narwhals/issues/1004 is addressed + # - API Completeness: + # - api-completeness/index.md + # - Supported DataFrame methods: api-completeness/dataframe.md + # - Supported Expr methods: api-completeness/expr.md + # - Supported Series methods: api-completeness/series.md - API Reference: - api-reference/narwhals.md - api-reference/dataframe.md From 8ad8947af103006a9ef661f3165d1d8317a3659f Mon Sep 17 00:00:00 2001 From: Francesco Bruzzesi <42817048+FBruzzesi@users.noreply.github.com> Date: Tue, 1 Oct 2024 11:10:25 +0200 Subject: [PATCH 077/145] release: Bump version to 1.9.0 (#1110) --- docs/installation.md | 2 +- narwhals/__init__.py | 2 +- pyproject.toml | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/docs/installation.md b/docs/installation.md index 505bec639..0f031b398 100644 --- a/docs/installation.md +++ b/docs/installation.md @@ -13,7 +13,7 @@ Then, if you start the Python REPL and see the following: ```python >>> import narwhals >>> narwhals.__version__ -'1.8.4' +'1.9.0' ``` then installation worked correctly! diff --git a/narwhals/__init__.py b/narwhals/__init__.py index e5ee71a18..585af0ffb 100644 --- a/narwhals/__init__.py +++ b/narwhals/__init__.py @@ -58,7 +58,7 @@ from narwhals.utils import maybe_reset_index from narwhals.utils import maybe_set_index -__version__ = "1.8.4" +__version__ = "1.9.0" __all__ = [ "dependencies", diff --git a/pyproject.toml b/pyproject.toml index e5fcf1abc..26cd3b8fd 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -4,7 +4,7 @@ build-backend = "hatchling.build" [project] name = "narwhals" -version = "1.8.4" +version = "1.9.0" authors = [ { name="Marco Gorelli", email="33491632+MarcoGorelli@users.noreply.github.com" }, ] From cfd576815b0e9a70c264cf73eccce1014f09cad8 Mon Sep 17 00:00:00 2001 From: Liam Connors Date: Wed, 2 Oct 2024 00:00:03 -0400 Subject: [PATCH 078/145] docs: fix some docs issues (#1114) * fix import statement * fix typo in installation page --- docs/installation.md | 2 +- narwhals/stable/v1/__init__.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/docs/installation.md b/docs/installation.md index 0f031b398..2b725040f 100644 --- a/docs/installation.md +++ b/docs/installation.md @@ -69,4 +69,4 @@ If you run `python t.py` then your output should look like the above. This is th function - as we'll soon see, we can do much more advanced things. Let's learn about what you just did, and what Narwhals can do for you! -Note: these examples are only using pandas and Polars. Please see the following to find the [supported libriaries](extending.md). +Note: these examples are only using pandas and Polars. Please see the following to find the [supported libraries](extending.md). 
diff --git a/narwhals/stable/v1/__init__.py b/narwhals/stable/v1/__init__.py index b0cefc3e6..84b94aebf 100644 --- a/narwhals/stable/v1/__init__.py +++ b/narwhals/stable/v1/__init__.py @@ -346,7 +346,7 @@ def collect(self) -> DataFrame[Any]: DataFrame Examples: - >>> import narwhals as nw + >>> import narwhals.stable.v1 as nw >>> import polars as pl >>> lf_pl = pl.LazyFrame( ... { From afa73953eb4854e48ae2dcbfad07eeb30f202a9f Mon Sep 17 00:00:00 2001 From: Liam Connors Date: Wed, 2 Oct 2024 04:19:07 -0400 Subject: [PATCH 079/145] xfail empty string in test_unpivot (#1113) --- tests/frame/unpivot_test.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/tests/frame/unpivot_test.py b/tests/frame/unpivot_test.py index cccf38035..64d003bbf 100644 --- a/tests/frame/unpivot_test.py +++ b/tests/frame/unpivot_test.py @@ -51,7 +51,11 @@ def test_unpivot_var_value_names( constructor: Constructor, variable_name: str | None, value_name: str | None, + request: pytest.FixtureRequest, ) -> None: + if variable_name == "" and "cudf" in str(constructor): + request.applymarker(pytest.mark.xfail) + df = nw.from_native(constructor(data)) result = df.unpivot( on=["b", "c"], index=["a"], variable_name=variable_name, value_name=value_name From 09e184f7d343c28c4f575943d8fc6505581f4b2e Mon Sep 17 00:00:00 2001 From: Marco Edward Gorelli Date: Wed, 2 Oct 2024 17:17:30 +0100 Subject: [PATCH 080/145] correct python bytes description (#1115) --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index 847943486..bcf314a47 100644 --- a/README.md +++ b/README.md @@ -119,7 +119,7 @@ Narwhals has been featured in several talks, podcasts, and blog posts: Ahoy, Narwhals are bridging the data science APIs - [Python Bytes Podcast](https://www.youtube.com/live/N7w_ESVW40I?si=y-wN1uCsAuJOKlOT&t=382) - Ahoy, Narwhals are bridging the data science APIs + Episode 402, topic #2 - [Super Data Science: ML & AI Podcast](https://www.youtube.com/watch?v=TeG4U8R0U8U) Narwhals: For Pandas-to-Polars DataFrame Compatibility From 3f8ba384d9fbed88a10689d06456b96d402a3637 Mon Sep 17 00:00:00 2001 From: Marco Edward Gorelli Date: Thu, 3 Oct 2024 17:45:02 +0100 Subject: [PATCH 081/145] fix: Fix `DataFrame.__getitem__` when slicing with tuple and null slice (#1123) * fix: Fix `DataFrame.__getitem__` when slicing with tuple and null slice * old polars compat --- narwhals/_arrow/dataframe.py | 4 ++++ narwhals/_pandas_like/dataframe.py | 2 ++ narwhals/_polars/dataframe.py | 7 +++++++ narwhals/dataframe.py | 2 -- tests/frame/getitem_test.py | 9 +++++++++ 5 files changed, 22 insertions(+), 2 deletions(-) diff --git a/narwhals/_arrow/dataframe.py b/narwhals/_arrow/dataframe.py index efc343177..a46c63945 100644 --- a/narwhals/_arrow/dataframe.py +++ b/narwhals/_arrow/dataframe.py @@ -176,6 +176,10 @@ def __getitem__( elif isinstance(item, tuple) and len(item) == 2: if isinstance(item[1], slice): columns = self.columns + if item[1] == slice(None): + if isinstance(item[0], Sequence) and len(item[0]) == 0: + return self._from_native_frame(self._native_frame.slice(0, 0)) + return self._from_native_frame(self._native_frame.take(item[0])) if isinstance(item[1].start, str) or isinstance(item[1].stop, str): start, stop, step = convert_str_slice_to_int_slice(item[1], columns) return self._from_native_frame( diff --git a/narwhals/_pandas_like/dataframe.py b/narwhals/_pandas_like/dataframe.py index aae86cef7..e808545ae 100644 --- a/narwhals/_pandas_like/dataframe.py +++ b/narwhals/_pandas_like/dataframe.py @@ 
-186,6 +186,8 @@ def __getitem__( elif isinstance(item, tuple) and len(item) == 2 and isinstance(item[1], slice): columns = self._native_frame.columns + if item[1] == slice(None): + return self._from_native_frame(self._native_frame.iloc[item[0], :]) if isinstance(item[1].start, str) or isinstance(item[1].stop, str): start, stop, step = convert_str_slice_to_int_slice(item[1], columns) return self._from_native_frame( diff --git a/narwhals/_polars/dataframe.py b/narwhals/_polars/dataframe.py index a4e30ec63..0ef2f879d 100644 --- a/narwhals/_polars/dataframe.py +++ b/narwhals/_polars/dataframe.py @@ -2,6 +2,7 @@ from typing import TYPE_CHECKING from typing import Any +from typing import Sequence from narwhals._polars.namespace import PolarsNamespace from narwhals._polars.utils import convert_str_slice_to_int_slice @@ -117,6 +118,12 @@ def __getitem__(self, item: Any) -> Any: columns = self.columns if isinstance(item, tuple) and len(item) == 2 and isinstance(item[1], slice): + if item[1] == slice(None): + if isinstance(item[0], Sequence) and not len(item[0]): + return self._from_native_frame(self._native_frame[0:0]) + return self._from_native_frame( + self._native_frame.__getitem__(item[0]) + ) if isinstance(item[1].start, str) or isinstance(item[1].stop, str): start, stop, step = convert_str_slice_to_int_slice(item[1], columns) return self._from_native_frame( diff --git a/narwhals/dataframe.py b/narwhals/dataframe.py index 0ee778ab2..e4627484a 100644 --- a/narwhals/dataframe.py +++ b/narwhals/dataframe.py @@ -843,8 +843,6 @@ def __getitem__( ): if item[1] == slice(None) and item[0] == slice(None): return self - if item[1] == slice(None): - return self._from_compliant_dataframe(self._compliant_frame[item[0]]) return self._from_compliant_dataframe(self._compliant_frame[item]) if isinstance(item, str) or (isinstance(item, tuple) and len(item) == 2): return self._series( diff --git a/tests/frame/getitem_test.py b/tests/frame/getitem_test.py index 894555b3d..ce96c1b24 100644 --- a/tests/frame/getitem_test.py +++ b/tests/frame/getitem_test.py @@ -177,6 +177,15 @@ def test_slice_slice_columns(constructor_eager: Any) -> None: # noqa: PLR0915 result = df["a":"b"] # type: ignore[misc] expected = {"a": [1, 2, 3], "b": [4, 5, 6]} compare_dicts(result, expected) + result = df[(0, 1), :] + expected = {"a": [1, 2], "b": [4, 5], "c": [7, 8], "d": [1, 4]} + compare_dicts(result, expected) + result = df[[0, 1], :] + expected = {"a": [1, 2], "b": [4, 5], "c": [7, 8], "d": [1, 4]} + compare_dicts(result, expected) + result = df[[0, 1], df.columns] + expected = {"a": [1, 2], "b": [4, 5], "c": [7, 8], "d": [1, 4]} + compare_dicts(result, expected) def test_slice_invalid(constructor_eager: Any) -> None: From bfd42e5c97d038c93100dc28f53d9f57e88e1acf Mon Sep 17 00:00:00 2001 From: Francesco Bruzzesi <42817048+FBruzzesi@users.noreply.github.com> Date: Thu, 3 Oct 2024 19:06:52 +0200 Subject: [PATCH 082/145] add nulls_last kw in dataframe sort (#1124) --- narwhals/_arrow/dataframe.py | 8 ++++-- narwhals/_dask/dataframe.py | 8 ++++-- narwhals/_pandas_like/dataframe.py | 8 ++++-- narwhals/dataframe.py | 43 ++++++++++++++++++------------ tests/frame/sort_test.py | 20 ++++++++++++++ 5 files changed, 64 insertions(+), 23 deletions(-) diff --git a/narwhals/_arrow/dataframe.py b/narwhals/_arrow/dataframe.py index a46c63945..905ada598 100644 --- a/narwhals/_arrow/dataframe.py +++ b/narwhals/_arrow/dataframe.py @@ -395,7 +395,8 @@ def sort( self, by: str | Iterable[str], *more_by: str, - descending: bool | Sequence[bool] = False, 
+ descending: bool | Sequence[bool], + nulls_last: bool, ) -> Self: flat_keys = flatten([*flatten([by]), *more_by]) df = self._native_frame @@ -408,7 +409,10 @@ def sort( (key, "descending" if is_descending else "ascending") for key, is_descending in zip(flat_keys, descending) ] - return self._from_native_frame(df.sort_by(sorting=sorting)) + + null_placement = "at_end" if nulls_last else "at_start" + + return self._from_native_frame(df.sort_by(sorting, null_placement=null_placement)) def to_pandas(self) -> Any: return self._native_frame.to_pandas() diff --git a/narwhals/_dask/dataframe.py b/narwhals/_dask/dataframe.py index 916583eaa..31949cf22 100644 --- a/narwhals/_dask/dataframe.py +++ b/narwhals/_dask/dataframe.py @@ -206,7 +206,8 @@ def sort( self: Self, by: str | Iterable[str], *more_by: str, - descending: bool | Sequence[bool] = False, + descending: bool | Sequence[bool], + nulls_last: bool, ) -> Self: flat_keys = flatten([*flatten([by]), *more_by]) df = self._native_frame @@ -214,7 +215,10 @@ def sort( ascending: bool | list[bool] = not descending else: ascending = [not d for d in descending] - return self._from_native_frame(df.sort_values(flat_keys, ascending=ascending)) + na_position = "last" if nulls_last else "first" + return self._from_native_frame( + df.sort_values(flat_keys, ascending=ascending, na_position=na_position) + ) def join( self: Self, diff --git a/narwhals/_pandas_like/dataframe.py b/narwhals/_pandas_like/dataframe.py index e808545ae..b9788b533 100644 --- a/narwhals/_pandas_like/dataframe.py +++ b/narwhals/_pandas_like/dataframe.py @@ -413,7 +413,8 @@ def sort( self, by: str | Iterable[str], *more_by: str, - descending: bool | Sequence[bool] = False, + descending: bool | Sequence[bool], + nulls_last: bool, ) -> Self: flat_keys = flatten([*flatten([by]), *more_by]) df = self._native_frame @@ -421,7 +422,10 @@ def sort( ascending: bool | list[bool] = not descending else: ascending = [not d for d in descending] - return self._from_native_frame(df.sort_values(flat_keys, ascending=ascending)) + na_position = "last" if nulls_last else "first" + return self._from_native_frame( + df.sort_values(flat_keys, ascending=ascending, na_position=na_position) + ) # --- convert --- def collect(self) -> PandasLikeDataFrame: diff --git a/narwhals/dataframe.py b/narwhals/dataframe.py index e4627484a..e4ad31b38 100644 --- a/narwhals/dataframe.py +++ b/narwhals/dataframe.py @@ -170,9 +170,12 @@ def sort( by: str | Iterable[str], *more_by: str, descending: bool | Sequence[bool] = False, + nulls_last: bool = False, ) -> Self: return self._from_compliant_dataframe( - self._compliant_frame.sort(by, *more_by, descending=descending) + self._compliant_frame.sort( + by, *more_by, descending=descending, nulls_last=nulls_last + ) ) def join( @@ -1944,19 +1947,22 @@ def sort( by: str | Iterable[str], *more_by: str, descending: bool | Sequence[bool] = False, + nulls_last: bool = False, ) -> Self: r""" Sort the dataframe by the given columns. Arguments: by: Column(s) names to sort by. + *more_by: Additional columns to sort by, specified as positional arguments. + descending: Sort in descending order. When sorting by multiple columns, can be + specified per column by passing a sequence of booleans. + nulls_last: Place null values last. - *more_by: Additional columns to sort by, specified as positional - arguments. - - descending: Sort in descending order. When sorting by multiple - columns, can be specified per column by passing a - sequence of booleans. 
+ Warning: + Unlike Polars, it is not possible to specify a sequence of booleans for + `nulls_last` in order to control per-column behaviour. Instead a single + boolean is applied for all `by` columns. Examples: >>> import narwhals as nw @@ -1996,7 +2002,7 @@ def sort( │ 2 ┆ 5.0 ┆ c │ └──────┴─────┴─────┘ """ - return super().sort(by, *more_by, descending=descending) + return super().sort(by, *more_by, descending=descending, nulls_last=nulls_last) def join( self, @@ -3858,20 +3864,23 @@ def sort( by: str | Iterable[str], *more_by: str, descending: bool | Sequence[bool] = False, + nulls_last: bool = False, ) -> Self: r""" Sort the LazyFrame by the given columns. Arguments: - by: Column(s) to sort by. Accepts expression input. Strings are - parsed as column names. - - *more_by: Additional columns to sort by, specified as positional - arguments. + by: Column(s) names to sort by. + *more_by: Additional columns to sort by, specified as positional arguments. + descending: Sort in descending order. When sorting by multiple columns, can be + specified per column by passing a sequence of booleans. + nulls_last: Place null values last; can specify a single boolean applying to + all columns or a sequence of booleans for per-column control. - descending: Sort in descending order. When sorting by multiple - columns, can be specified per column by passing a - sequence of booleans. + Warning: + Unlike Polars, it is not possible to specify a sequence of booleans for + `nulls_last` in order to control per-column behaviour. Instead a single + boolean is applied for all `by` columns. Examples: >>> import narwhals as nw @@ -3911,7 +3920,7 @@ def sort( │ 2 ┆ 5.0 ┆ c │ └──────┴─────┴─────┘ """ - return super().sort(by, *more_by, descending=descending) + return super().sort(by, *more_by, descending=descending, nulls_last=nulls_last) def join( self, diff --git a/tests/frame/sort_test.py b/tests/frame/sort_test.py index 06f5d079f..bea9177df 100644 --- a/tests/frame/sort_test.py +++ b/tests/frame/sort_test.py @@ -1,3 +1,7 @@ +from __future__ import annotations + +import pytest + import narwhals.stable.v1 as nw from tests.utils import Constructor from tests.utils import compare_dicts @@ -20,3 +24,19 @@ def test_sort(constructor: Constructor) -> None: "z": [8.0, 9.0, 7.0], } compare_dicts(result, expected) + + +@pytest.mark.parametrize( + ("nulls_last", "expected"), + [ + (True, {"a": [0, 2, 0, -1], "b": [3, 2, 1, float("nan")]}), + (False, {"a": [-1, 0, 2, 0], "b": [float("nan"), 3, 2, 1]}), + ], +) +def test_sort_nulls( + constructor: Constructor, *, nulls_last: bool, expected: dict[str, float] +) -> None: + data = {"a": [0, 0, 2, -1], "b": [1, 3, 2, None]} + df = nw.from_native(constructor(data)) + result = df.sort("b", descending=True, nulls_last=nulls_last) + compare_dicts(result, expected) From 1a9af18db2e0fbd4ca892a296f00d2121d69ecc2 Mon Sep 17 00:00:00 2001 From: Liam Connors Date: Thu, 3 Oct 2024 13:07:45 -0400 Subject: [PATCH 083/145] test: xfail tz_aware test for cuDF (#1118) * xfail tz_aware for cuDF * add GitHub issue --- tests/expr_and_series/cast_test.py | 6 ++++-- tests/frame/unpivot_test.py | 1 + 2 files changed, 5 insertions(+), 2 deletions(-) diff --git a/tests/expr_and_series/cast_test.py b/tests/expr_and_series/cast_test.py index dafe876ab..2229c8abb 100644 --- a/tests/expr_and_series/cast_test.py +++ b/tests/expr_and_series/cast_test.py @@ -193,8 +193,10 @@ class Banana: def test_cast_datetime_tz_aware( constructor: Constructor, request: pytest.FixtureRequest ) -> None: - if "dask" in 
str(constructor) or ( - "pyarrow_table" in str(constructor) and is_windows() + if ( + "dask" in str(constructor) + or "cudf" in str(constructor) # https://github.com/rapidsai/cudf/issues/16973 + or ("pyarrow_table" in str(constructor) and is_windows()) ): request.applymarker(pytest.mark.xfail) diff --git a/tests/frame/unpivot_test.py b/tests/frame/unpivot_test.py index 64d003bbf..2adad6465 100644 --- a/tests/frame/unpivot_test.py +++ b/tests/frame/unpivot_test.py @@ -54,6 +54,7 @@ def test_unpivot_var_value_names( request: pytest.FixtureRequest, ) -> None: if variable_name == "" and "cudf" in str(constructor): + # https://github.com/rapidsai/cudf/issues/16972 request.applymarker(pytest.mark.xfail) df = nw.from_native(constructor(data)) From 2958cfde032da9127db6e58b0b5d5879ac2e4c53 Mon Sep 17 00:00:00 2001 From: Liam Connors Date: Fri, 4 Oct 2024 02:47:58 -0400 Subject: [PATCH 084/145] test: xfail `test_actual_object` for cuDF (#1129) --- tests/frame/schema_test.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/frame/schema_test.py b/tests/frame/schema_test.py index 3aa341e0f..b950f5286 100644 --- a/tests/frame/schema_test.py +++ b/tests/frame/schema_test.py @@ -61,7 +61,7 @@ def test_string_disguised_as_object() -> None: def test_actual_object(request: pytest.FixtureRequest, constructor_eager: Any) -> None: - if any(x in str(constructor_eager) for x in ("modin", "pyarrow_table")): + if any(x in str(constructor_eager) for x in ("modin", "pyarrow_table", "cudf")): request.applymarker(pytest.mark.xfail) class Foo: ... From f4ec002dfbd710b146851f4cd05eb5586c596f7a Mon Sep 17 00:00:00 2001 From: Marco Edward Gorelli Date: Fri, 4 Oct 2024 14:29:33 +0100 Subject: [PATCH 085/145] feat: make typing available under narwhals.stable.v1 (#1130) * make typing available under narwhals.stable.v1 * coverage * py-shiny compat --- narwhals/stable/v1/_dtypes.py | 86 +++++++++++++++++++++++++++++++++ narwhals/stable/v1/dtypes.py | 83 ++++++++++---------------------- narwhals/stable/v1/typing.py | 91 +++++++++++++++++++++++++++++++++++ pyproject.toml | 1 + 4 files changed, 204 insertions(+), 57 deletions(-) create mode 100644 narwhals/stable/v1/_dtypes.py create mode 100644 narwhals/stable/v1/typing.py diff --git a/narwhals/stable/v1/_dtypes.py b/narwhals/stable/v1/_dtypes.py new file mode 100644 index 000000000..13dd3237d --- /dev/null +++ b/narwhals/stable/v1/_dtypes.py @@ -0,0 +1,86 @@ +from narwhals.dtypes import Array +from narwhals.dtypes import Boolean +from narwhals.dtypes import Categorical +from narwhals.dtypes import Date +from narwhals.dtypes import Datetime as NwDatetime +from narwhals.dtypes import DType +from narwhals.dtypes import Duration as NwDuration +from narwhals.dtypes import Enum +from narwhals.dtypes import Float32 +from narwhals.dtypes import Float64 +from narwhals.dtypes import Int8 +from narwhals.dtypes import Int16 +from narwhals.dtypes import Int32 +from narwhals.dtypes import Int64 +from narwhals.dtypes import List +from narwhals.dtypes import NumericType +from narwhals.dtypes import Object +from narwhals.dtypes import String +from narwhals.dtypes import Struct +from narwhals.dtypes import UInt8 +from narwhals.dtypes import UInt16 +from narwhals.dtypes import UInt32 +from narwhals.dtypes import UInt64 +from narwhals.dtypes import Unknown + + +class Datetime(NwDatetime): + """ + Data type representing a calendar date and time of day. + + Arguments: + time_unit: Unit of time. Defaults to `'us'` (microseconds). 
+ time_zone: Time zone string, as defined in zoneinfo (to see valid strings run + `import zoneinfo; zoneinfo.available_timezones()` for a full list). + + Notes: + Adapted from Polars implementation at: + https://github.com/pola-rs/polars/blob/py-1.7.1/py-polars/polars/datatypes/classes.py#L398-L457 + """ + + def __hash__(self) -> int: + return hash(self.__class__) + + +class Duration(NwDuration): + """ + Data type representing a time duration. + + Arguments: + time_unit: Unit of time. Defaults to `'us'` (microseconds). + + Notes: + Adapted from Polars implementation at: + https://github.com/pola-rs/polars/blob/py-1.7.1/py-polars/polars/datatypes/classes.py#L460-L502 + """ + + def __hash__(self) -> int: + return hash(self.__class__) + + +__all__ = [ + "Array", + "Boolean", + "Categorical", + "Date", + "Datetime", + "Duration", + "DType", + "Enum", + "Float32", + "Float64", + "Int8", + "Int16", + "Int32", + "Int64", + "List", + "NumericType", + "Object", + "String", + "Struct", + "UInt8", + "UInt16", + "UInt32", + "UInt64", + "Unknown", +] diff --git a/narwhals/stable/v1/dtypes.py b/narwhals/stable/v1/dtypes.py index 0d1e58468..f36da9725 100644 --- a/narwhals/stable/v1/dtypes.py +++ b/narwhals/stable/v1/dtypes.py @@ -1,60 +1,27 @@ -from narwhals.dtypes import Array -from narwhals.dtypes import Boolean -from narwhals.dtypes import Categorical -from narwhals.dtypes import Date -from narwhals.dtypes import Datetime as NwDatetime -from narwhals.dtypes import Duration as NwDuration -from narwhals.dtypes import Enum -from narwhals.dtypes import Float32 -from narwhals.dtypes import Float64 -from narwhals.dtypes import Int8 -from narwhals.dtypes import Int16 -from narwhals.dtypes import Int32 -from narwhals.dtypes import Int64 -from narwhals.dtypes import List -from narwhals.dtypes import Object -from narwhals.dtypes import String -from narwhals.dtypes import Struct -from narwhals.dtypes import UInt8 -from narwhals.dtypes import UInt16 -from narwhals.dtypes import UInt32 -from narwhals.dtypes import UInt64 -from narwhals.dtypes import Unknown - - -class Datetime(NwDatetime): - """ - Data type representing a calendar date and time of day. - - Arguments: - time_unit: Unit of time. Defaults to `'us'` (microseconds). - time_zone: Time zone string, as defined in zoneinfo (to see valid strings run - `import zoneinfo; zoneinfo.available_timezones()` for a full list). - - Notes: - Adapted from Polars implementation at: - https://github.com/pola-rs/polars/blob/py-1.7.1/py-polars/polars/datatypes/classes.py#L398-L457 - """ - - def __hash__(self) -> int: - return hash(self.__class__) - - -class Duration(NwDuration): - """ - Data type representing a time duration. - - Arguments: - time_unit: Unit of time. Defaults to `'us'` (microseconds). 
- - Notes: - Adapted from Polars implementation at: - https://github.com/pola-rs/polars/blob/py-1.7.1/py-polars/polars/datatypes/classes.py#L460-L502 - """ - - def __hash__(self) -> int: - return hash(self.__class__) - +from narwhals.stable.v1._dtypes import Array +from narwhals.stable.v1._dtypes import Boolean +from narwhals.stable.v1._dtypes import Categorical +from narwhals.stable.v1._dtypes import Date +from narwhals.stable.v1._dtypes import Datetime +from narwhals.stable.v1._dtypes import DType +from narwhals.stable.v1._dtypes import Duration +from narwhals.stable.v1._dtypes import Enum +from narwhals.stable.v1._dtypes import Float32 +from narwhals.stable.v1._dtypes import Float64 +from narwhals.stable.v1._dtypes import Int8 +from narwhals.stable.v1._dtypes import Int16 +from narwhals.stable.v1._dtypes import Int32 +from narwhals.stable.v1._dtypes import Int64 +from narwhals.stable.v1._dtypes import List +from narwhals.stable.v1._dtypes import NumericType +from narwhals.stable.v1._dtypes import Object +from narwhals.stable.v1._dtypes import String +from narwhals.stable.v1._dtypes import Struct +from narwhals.stable.v1._dtypes import UInt8 +from narwhals.stable.v1._dtypes import UInt16 +from narwhals.stable.v1._dtypes import UInt32 +from narwhals.stable.v1._dtypes import UInt64 +from narwhals.stable.v1._dtypes import Unknown __all__ = [ "Array", @@ -63,6 +30,7 @@ def __hash__(self) -> int: "Date", "Datetime", "Duration", + "DType", "Enum", "Float32", "Float64", @@ -71,6 +39,7 @@ def __hash__(self) -> int: "Int32", "Int64", "List", + "NumericType", "Object", "String", "Struct", diff --git a/narwhals/stable/v1/typing.py b/narwhals/stable/v1/typing.py new file mode 100644 index 000000000..e8ab9e1ae --- /dev/null +++ b/narwhals/stable/v1/typing.py @@ -0,0 +1,91 @@ +from __future__ import annotations + +from typing import TYPE_CHECKING +from typing import Any +from typing import Protocol +from typing import TypeVar +from typing import Union + +if TYPE_CHECKING: + import sys + + if sys.version_info >= (3, 10): + from typing import TypeAlias + else: + from typing_extensions import TypeAlias + + from narwhals.stable.v1 import DataFrame + from narwhals.stable.v1 import Expr + from narwhals.stable.v1 import LazyFrame + from narwhals.stable.v1 import Series + from narwhals.stable.v1 import dtypes + + # All dataframes supported by Narwhals have a + # `columns` property. Their similarities don't extend + # _that_ much further unfortunately... + class NativeFrame(Protocol): + @property + def columns(self) -> Any: ... + + def join(self, *args: Any, **kwargs: Any) -> Any: ... + + class DataFrameLike(Protocol): + def __dataframe__(self, *args: Any, **kwargs: Any) -> Any: ... 
+ + +IntoExpr: TypeAlias = Union["Expr", str, "Series"] +"""Anything which can be converted to an expression.""" + +IntoDataFrame: TypeAlias = Union["NativeFrame", "DataFrame[Any]", "DataFrameLike"] +"""Anything which can be converted to a Narwhals DataFrame.""" + +IntoFrame: TypeAlias = Union[ + "NativeFrame", "DataFrame[Any]", "LazyFrame[Any]", "DataFrameLike" +] +"""Anything which can be converted to a Narwhals DataFrame or LazyFrame.""" + +Frame: TypeAlias = Union["DataFrame[Any]", "LazyFrame[Any]"] +"""Narwhals DataFrame or Narwhals LazyFrame""" + +# TypeVars for some of the above +IntoFrameT = TypeVar("IntoFrameT", bound="IntoFrame") +IntoDataFrameT = TypeVar("IntoDataFrameT", bound="IntoDataFrame") +FrameT = TypeVar("FrameT", bound="Frame") +DataFrameT = TypeVar("DataFrameT", bound="DataFrame[Any]") + + +class DTypes: + Int64: type[dtypes.Int64] + Int32: type[dtypes.Int32] + Int16: type[dtypes.Int16] + Int8: type[dtypes.Int8] + UInt64: type[dtypes.UInt64] + UInt32: type[dtypes.UInt32] + UInt16: type[dtypes.UInt16] + UInt8: type[dtypes.UInt8] + Float64: type[dtypes.Float64] + Float32: type[dtypes.Float32] + String: type[dtypes.String] + Boolean: type[dtypes.Boolean] + Object: type[dtypes.Object] + Categorical: type[dtypes.Categorical] + Enum: type[dtypes.Enum] + Datetime: type[dtypes.Datetime] + Duration: type[dtypes.Duration] + Date: type[dtypes.Date] + Struct: type[dtypes.Struct] + List: type[dtypes.List] + Array: type[dtypes.Array] + Unknown: type[dtypes.Unknown] + + +__all__ = [ + "IntoExpr", + "IntoDataFrame", + "IntoDataFrameT", + "IntoFrame", + "IntoFrameT", + "Frame", + "FrameT", + "DataFrameT", +] diff --git a/pyproject.toml b/pyproject.toml index 26cd3b8fd..287db7612 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -129,6 +129,7 @@ plugins = ["covdefaults"] [tool.coverage.report] omit = [ 'narwhals/typing.py', + 'narwhals/stable/v1/typing.py', # we can run this in every environment that we measure coverage on due to upper-bound constraits 'narwhals/_ibis/*', ] From 45554da9a3c7f1aa0b44bcf6144f9b9766e0ff8a Mon Sep 17 00:00:00 2001 From: Marco Edward Gorelli Date: Fri, 4 Oct 2024 14:29:58 +0100 Subject: [PATCH 086/145] docs: add Ibis to "related projects" (#1126) * docs: add Ibis to "related projects" * reword * missing word --- docs/roadmap_and_related.md | 33 +++++++++++++++++++++++++++++++++ 1 file changed, 33 insertions(+) diff --git a/docs/roadmap_and_related.md b/docs/roadmap_and_related.md index 43580db6f..c976a5f1f 100644 --- a/docs/roadmap_and_related.md +++ b/docs/roadmap_and_related.md @@ -29,3 +29,36 @@ Array counterpart to the DataFrame API, see [here](https://data-apis.org/array-a Allows C extension modules to safely share pointers to C data structures with Python code and other C modules, encapsulating the pointer with a name and optional destructor to manage resources and ensure safe access, see [here](https://arrow.apache.org/docs/format/CDataInterface/PyCapsuleInterface.html) for details. Narwhals supports exporting a DataFrame via the Arrow PyCapsule Interface. + +### Ibis + +Pitched as "The portable Dataframe library", Ibis provides a Pythonic frontend +to various SQL (as well as Polars LazyFrame) engines. Some differences with Narwhals are: + +- Narwhals' main use case is for library maintainers wanting to support + different dataframe libraries without depending on any whilst keeping + things as lightweight as possible. Ibis is more targeted at end users + and aims to be thought of as a Dataframe library akin to + pandas / Polars / etc. 
+- Narwhals allows you to write a "Dataframe X in, Dataframe X out" function. + Ibis allows materialising to pandas, Polars (eager), and PyArrow, but has + no way to get back to the input type exactly (e.g. there's no way to + start with a Polars LazyFrame and get back a Polars LazyFrame) +- Narwhals respects input data types as much as possible, Ibis doesn't + support Categorical (nor does it distinguish between fixed-size-list and + list) +- Narwhals separates between lazy and eager APIs, with the eager API + provide very fine control over dataframe operations (slicing rows and + columns, iterating over rows, getting values out of the dataframe as + Python scalars). Ibis is more focused on lazy execution +- Ibis supports SQL engines (and can translate to SQL), + Narwhals is more focused traditional dataframes where row-order is defined + (although we are brainstorming a lazy-only level of support) +- Narwhals is extremely lightweight and comes with zero required dependencies, + Ibis requires pandas and PyArrow for all backends +- Narwhals supports Dask, whereas Ibis has deprecated support for it + +Although people often ask about the two tools, we consider them to be +very different and not in competition. Further efforts to clarify the +distinction are welcome 🙏! + From 915d84c6a2a9d4e22787ddcf94d61eef80c70b81 Mon Sep 17 00:00:00 2001 From: Marco Edward Gorelli Date: Fri, 4 Oct 2024 15:27:20 +0100 Subject: [PATCH 087/145] ci: add py-shiny to downstream tests (#1132) * ci: add py-shiny to downstream tests * try removing shallow clone --- .github/workflows/downstream_tests.yml | 46 ++++++++++++++++++++++++++ 1 file changed, 46 insertions(+) diff --git a/.github/workflows/downstream_tests.yml b/.github/workflows/downstream_tests.yml index 0dcb78209..12f7f95b7 100644 --- a/.github/workflows/downstream_tests.yml +++ b/.github/workflows/downstream_tests.yml @@ -86,3 +86,49 @@ jobs: run: | cd scikit-lego pytest -n auto --disable-warnings --cov=sklego -m "not cvxpy and not formulaic and not umap" + + shiny: + strategy: + matrix: + python-version: ["3.12"] + os: [ubuntu-latest] + + runs-on: ${{ matrix.os }} + steps: + - uses: actions/checkout@v4 + - uses: actions/setup-python@v5 + with: + python-version: ${{ matrix.python-version }} + - name: Install uv + uses: astral-sh/setup-uv@v3 + with: + enable-cache: "true" + cache-suffix: ${{ matrix.python-version }} + cache-dependency-glob: "**requirements*.txt" + - name: clone-shiny + run: | + git clone https://github.com/posit-dev/py-shiny.git + cd py-shiny + git log + - name: install-basics + run: uv pip install --upgrade tox virtualenv setuptools --system + - name: install-shiny-dev + run: | + cd py-shiny + uv pip install -e ".[dev,test]" --system + - name: install-narwhals-dev + run: | + uv pip uninstall narwhals --system + uv pip install -e . 
--system + - name: show-deps + run: uv pip freeze + - name: Run pytest + run: | + cd py-shiny + python tests/pytest/asyncio_prevent.py + pytest + - name: Run mypy + run: | + cd py-shiny + uv pip install mypy --system + mypy shiny From a2a22ce4b62d1f6032b5065a63cc5b1fddfea0f6 Mon Sep 17 00:00:00 2001 From: raisadz <34237447+raisadz@users.noreply.github.com> Date: Fri, 4 Oct 2024 15:47:03 +0100 Subject: [PATCH 088/145] feat: `Series.str.to_datetime` (#1131) * add series to_datetime to nw str namespace, add pyarrow example * add to_datetime to api-ref --- docs/api-reference/series_str.md | 1 + narwhals/expr.py | 14 ++++- narwhals/series.py | 57 +++++++++++++++++++ tests/expr_and_series/str/to_datetime_test.py | 16 ++++++ 4 files changed, 85 insertions(+), 3 deletions(-) diff --git a/docs/api-reference/series_str.md b/docs/api-reference/series_str.md index 7bbfccb67..bd737d8b7 100644 --- a/docs/api-reference/series_str.md +++ b/docs/api-reference/series_str.md @@ -13,6 +13,7 @@ - slice - starts_with - strip_chars + - to_datetime - tail show_source: false show_bases: false diff --git a/narwhals/expr.py b/narwhals/expr.py index 9f5e2d571..b43fbde00 100644 --- a/narwhals/expr.py +++ b/narwhals/expr.py @@ -2516,9 +2516,12 @@ def to_datetime(self, format: str) -> Expr: # noqa: A002 Examples: >>> import pandas as pd >>> import polars as pl + >>> import pyarrow as pa >>> import narwhals as nw - >>> df_pd = pd.DataFrame({"a": ["2020-01-01", "2020-01-02"]}) - >>> df_pl = pl.DataFrame({"a": ["2020-01-01", "2020-01-02"]}) + >>> data = ["2020-01-01", "2020-01-02"] + >>> df_pd = pd.DataFrame({"a": data}) + >>> df_pl = pl.DataFrame({"a": data}) + >>> df_pa = pa.table({"a": data}) We define a dataframe-agnostic function: @@ -2526,7 +2529,7 @@ def to_datetime(self, format: str) -> Expr: # noqa: A002 ... def func(df): ... return df.select(nw.col("a").str.to_datetime(format="%Y-%m-%d")) - We can then pass either pandas or Polars to `func`: + We can then pass any supported library such as pandas, Polars, or PyArrow: >>> func(df_pd) a @@ -2542,6 +2545,11 @@ def to_datetime(self, format: str) -> Expr: # noqa: A002 │ 2020-01-01 00:00:00 │ │ 2020-01-02 00:00:00 │ └─────────────────────┘ + >>> func(df_pa) + pyarrow.Table + a: timestamp[us] + ---- + a: [[2020-01-01 00:00:00.000000,2020-01-02 00:00:00.000000]] """ return self._expr.__class__( lambda plx: self._expr._call(plx).str.to_datetime(format=format) diff --git a/narwhals/series.py b/narwhals/series.py index 5a84a9a5d..bb9709068 100644 --- a/narwhals/series.py +++ b/narwhals/series.py @@ -3083,6 +3083,63 @@ def to_lowercase(self) -> Series: self._narwhals_series._compliant_series.str.to_lowercase() ) + def to_datetime(self, format: str) -> Series: # noqa: A002 + """ + Parse Series with strings to a Series with Datetime dtype. + + Notes: + pandas defaults to nanosecond time unit, Polars to microsecond. + Prior to pandas 2.0, nanoseconds were the only time unit supported + in pandas, with no ability to set any other one. The ability to + set the time unit in pandas, if the version permits, will arrive. + + Arguments: + format: Format to parse strings with. Must be passed, as different + dataframe libraries have different ways of auto-inferring + formats. 
+ + Examples: + >>> import pandas as pd + >>> import polars as pl + >>> import pyarrow as pa + >>> import narwhals as nw + >>> data = ["2020-01-01", "2020-01-02"] + >>> s_pd = pd.Series(data) + >>> s_pl = pl.Series(data) + >>> s_pa = pa.chunked_array([data]) + + We define a dataframe-agnostic function: + + >>> @nw.narwhalify + ... def func(s): + ... return s.str.to_datetime(format="%Y-%m-%d") + + We can then pass any supported library such as pandas, Polars, or PyArrow:: + + >>> func(s_pd) + 0 2020-01-01 + 1 2020-01-02 + dtype: datetime64[ns] + >>> func(s_pl) # doctest: +NORMALIZE_WHITESPACE + shape: (2,) + Series: '' [datetime[μs]] + [ + 2020-01-01 00:00:00 + 2020-01-02 00:00:00 + ] + >>> func(s_pa) # doctest: +ELLIPSIS + + [ + [ + 2020-01-01 00:00:00.000000, + 2020-01-02 00:00:00.000000 + ] + ] + """ + return self._narwhals_series._from_compliant_series( + self._narwhals_series._compliant_series.str.to_datetime(format=format) + ) + class SeriesDateTimeNamespace: def __init__(self, series: Series) -> None: diff --git a/tests/expr_and_series/str/to_datetime_test.py b/tests/expr_and_series/str/to_datetime_test.py index 4eb768465..a64a3c58b 100644 --- a/tests/expr_and_series/str/to_datetime_test.py +++ b/tests/expr_and_series/str/to_datetime_test.py @@ -1,3 +1,5 @@ +from typing import Any + import narwhals.stable.v1 as nw from tests.utils import Constructor @@ -18,3 +20,17 @@ def test_to_datetime(constructor: Constructor) -> None: .item(row=0, column="b") ) assert str(result) == expected + + +def test_to_datetime_series(constructor_eager: Any) -> None: + if "cudf" in str(constructor_eager): # pragma: no cover + expected = "2020-01-01T12:34:56.000000000" + else: + expected = "2020-01-01 12:34:56" + + result = ( + nw.from_native(constructor_eager(data), eager_only=True)["a"].str.to_datetime( + format="%Y-%m-%dT%H:%M:%S" + ) + ).item(0) + assert str(result) == expected From ca4bf5034e2f1247178ce4d2674f34fd7b7dc83e Mon Sep 17 00:00:00 2001 From: Marco Edward Gorelli Date: Fri, 4 Oct 2024 16:01:11 +0100 Subject: [PATCH 089/145] release: Bump version to 1.9.1 (#1134) --- docs/installation.md | 2 +- narwhals/__init__.py | 2 +- pyproject.toml | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/docs/installation.md b/docs/installation.md index 2b725040f..0fc410132 100644 --- a/docs/installation.md +++ b/docs/installation.md @@ -13,7 +13,7 @@ Then, if you start the Python REPL and see the following: ```python >>> import narwhals >>> narwhals.__version__ -'1.9.0' +'1.9.1' ``` then installation worked correctly! 
diff --git a/narwhals/__init__.py b/narwhals/__init__.py index 585af0ffb..63dc6acd6 100644 --- a/narwhals/__init__.py +++ b/narwhals/__init__.py @@ -58,7 +58,7 @@ from narwhals.utils import maybe_reset_index from narwhals.utils import maybe_set_index -__version__ = "1.9.0" +__version__ = "1.9.1" __all__ = [ "dependencies", diff --git a/pyproject.toml b/pyproject.toml index 287db7612..48cae220d 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -4,7 +4,7 @@ build-backend = "hatchling.build" [project] name = "narwhals" -version = "1.9.0" +version = "1.9.1" authors = [ { name="Marco Gorelli", email="33491632+MarcoGorelli@users.noreply.github.com" }, ] From bb493c609bbda48ee475435eb629be81697f3964 Mon Sep 17 00:00:00 2001 From: Cheuk Ting Ho Date: Fri, 4 Oct 2024 17:58:40 +0200 Subject: [PATCH 090/145] fix: fixing unnesscary raise of mean_horizontal (#1082) --------- Co-authored-by: Marco Gorelli <33491632+MarcoGorelli@users.noreply.github.com> --- narwhals/_arrow/namespace.py | 25 +++++++++++++---- narwhals/_dask/namespace.py | 28 ++++++++++++++++--- narwhals/_dask/utils.py | 8 ++++++ narwhals/_pandas_like/namespace.py | 20 +++++++++---- narwhals/_polars/namespace.py | 11 +++----- tests/expr_and_series/mean_horizontal_test.py | 15 ++++++++++ 6 files changed, 85 insertions(+), 22 deletions(-) diff --git a/narwhals/_arrow/namespace.py b/narwhals/_arrow/namespace.py index 3e7f4ecc9..7f0bcbec8 100644 --- a/narwhals/_arrow/namespace.py +++ b/narwhals/_arrow/namespace.py @@ -215,13 +215,26 @@ def func(df: ArrowDataFrame) -> list[ArrowSeries]: ) def mean_horizontal(self, *exprs: IntoArrowExpr) -> IntoArrowExpr: - arrow_exprs = parse_into_exprs(*exprs, namespace=self) - total = reduce(lambda x, y: x + y, (e.fill_null(0.0) for e in arrow_exprs)) - n_non_zero = reduce( - lambda x, y: x + y, - ((1 - e.is_null().cast(self._dtypes.Int64())) for e in arrow_exprs), + parsed_exprs = parse_into_exprs(*exprs, namespace=self) + + def func(df: ArrowDataFrame) -> list[ArrowSeries]: + series = (s.fill_null(0) for _expr in parsed_exprs for s in _expr._call(df)) + non_na = ( + 1 - s.is_null().cast(self._dtypes.Int64()) + for _expr in parsed_exprs + for s in _expr._call(df) + ) + return [ + reduce(lambda x, y: x + y, series) / reduce(lambda x, y: x + y, non_na) + ] + + return self._create_expr_from_callable( + func=func, + depth=max(x._depth for x in parsed_exprs) + 1, + function_name="mean_horizontal", + root_names=combine_root_names(parsed_exprs), + output_names=reduce_output_names(parsed_exprs), ) - return total / n_non_zero def concat( self, diff --git a/narwhals/_dask/namespace.py b/narwhals/_dask/namespace.py index 82fc29490..580689c15 100644 --- a/narwhals/_dask/namespace.py +++ b/narwhals/_dask/namespace.py @@ -12,6 +12,8 @@ from narwhals._dask.dataframe import DaskLazyFrame from narwhals._dask.expr import DaskExpr from narwhals._dask.selectors import DaskSelectorNamespace +from narwhals._dask.utils import name_preserving_div +from narwhals._dask.utils import name_preserving_sum from narwhals._dask.utils import narwhals_to_native_dtype from narwhals._dask.utils import validate_comparand from narwhals._expression_parsing import combine_root_names @@ -231,10 +233,28 @@ def concat( raise NotImplementedError def mean_horizontal(self, *exprs: IntoDaskExpr) -> IntoDaskExpr: - dask_exprs = parse_into_exprs(*exprs, namespace=self) - total = reduce(lambda x, y: x + y, (e.fill_null(0.0) for e in dask_exprs)) - n_non_zero = reduce(lambda x, y: x + y, ((1 - e.is_null()) for e in dask_exprs)) - return total / n_non_zero 
+ parsed_exprs = parse_into_exprs(*exprs, namespace=self) + + def func(df: DaskLazyFrame) -> list[dask_expr.Series]: + series = (s.fillna(0) for _expr in parsed_exprs for s in _expr._call(df)) + non_na = (1 - s.isna() for _expr in parsed_exprs for s in _expr._call(df)) + return [ + name_preserving_div( + reduce(name_preserving_sum, series), + reduce(name_preserving_sum, non_na), + ) + ] + + return DaskExpr( + call=func, + depth=max(x._depth for x in parsed_exprs) + 1, + function_name="mean_horizontal", + root_names=combine_root_names(parsed_exprs), + output_names=reduce_output_names(parsed_exprs), + returns_scalar=False, + backend_version=self._backend_version, + dtypes=self._dtypes, + ) def _create_expr_from_series(self, _: Any) -> NoReturn: msg = "`_create_expr_from_series` for DaskNamespace exists only for compatibility" diff --git a/narwhals/_dask/utils.py b/narwhals/_dask/utils.py index f7636bd5f..2ba7cdcbd 100644 --- a/narwhals/_dask/utils.py +++ b/narwhals/_dask/utils.py @@ -133,3 +133,11 @@ def narwhals_to_native_dtype(dtype: DType | type[DType], dtypes: DTypes) -> Any: msg = f"Unknown dtype: {dtype}" # pragma: no cover raise AssertionError(msg) + + +def name_preserving_sum(s1: dask_expr.Series, s2: dask_expr.Series) -> dask_expr.Series: + return (s1 + s2).rename(s1.name) + + +def name_preserving_div(s1: dask_expr.Series, s2: dask_expr.Series) -> dask_expr.Series: + return (s1 / s2).rename(s1.name) diff --git a/narwhals/_pandas_like/namespace.py b/narwhals/_pandas_like/namespace.py index 6aacf2856..35fd878b7 100644 --- a/narwhals/_pandas_like/namespace.py +++ b/narwhals/_pandas_like/namespace.py @@ -261,12 +261,22 @@ def func(df: PandasLikeDataFrame) -> list[PandasLikeSeries]: ) def mean_horizontal(self, *exprs: IntoPandasLikeExpr) -> PandasLikeExpr: - pandas_like_exprs = parse_into_exprs(*exprs, namespace=self) - total = reduce(lambda x, y: x + y, (e.fill_null(0.0) for e in pandas_like_exprs)) - n_non_zero = reduce( - lambda x, y: x + y, ((1 - e.is_null()) for e in pandas_like_exprs) + parsed_exprs = parse_into_exprs(*exprs, namespace=self) + + def func(df: PandasLikeDataFrame) -> list[PandasLikeSeries]: + series = (s.fill_null(0) for _expr in parsed_exprs for s in _expr._call(df)) + non_na = (1 - s.is_null() for _expr in parsed_exprs for s in _expr._call(df)) + return [ + reduce(lambda x, y: x + y, series) / reduce(lambda x, y: x + y, non_na) + ] + + return self._create_expr_from_callable( + func=func, + depth=max(x._depth for x in parsed_exprs) + 1, + function_name="mean_horizontal", + root_names=combine_root_names(parsed_exprs), + output_names=reduce_output_names(parsed_exprs), ) - return total / n_non_zero def concat( self, diff --git a/narwhals/_polars/namespace.py b/narwhals/_polars/namespace.py index 21facd81f..c43acbb26 100644 --- a/narwhals/_polars/namespace.py +++ b/narwhals/_polars/namespace.py @@ -1,6 +1,5 @@ from __future__ import annotations -from functools import reduce from typing import TYPE_CHECKING from typing import Any from typing import Iterable @@ -107,16 +106,14 @@ def mean_horizontal(self, *exprs: IntoPolarsExpr) -> PolarsExpr: polars_exprs = parse_into_exprs(*exprs, namespace=self) if self._backend_version < (0, 20, 8): # pragma: no cover - total = reduce(lambda x, y: x + y, (e.fill_null(0.0) for e in polars_exprs)) - n_non_zero = reduce( - lambda x, y: x + y, ((1 - e.is_null()) for e in polars_exprs) - ) return PolarsExpr( - total._native_expr / n_non_zero._native_expr, dtypes=self._dtypes + pl.sum_horizontal(e._native_expr for e in polars_exprs) + / 
pl.sum_horizontal(1 - e.is_null()._native_expr for e in polars_exprs), + dtypes=self._dtypes, ) return PolarsExpr( - pl.mean_horizontal([e._native_expr for e in polars_exprs]), + pl.mean_horizontal(e._native_expr for e in polars_exprs), dtypes=self._dtypes, ) diff --git a/tests/expr_and_series/mean_horizontal_test.py b/tests/expr_and_series/mean_horizontal_test.py index f4ad35b92..ce9ac8fe0 100644 --- a/tests/expr_and_series/mean_horizontal_test.py +++ b/tests/expr_and_series/mean_horizontal_test.py @@ -14,3 +14,18 @@ def test_meanh(constructor: Constructor, col_expr: Any) -> None: result = df.select(horizontal_mean=nw.mean_horizontal(col_expr, nw.col("b"))) expected = {"horizontal_mean": [2.5, 3.0, 6.0, float("nan")]} compare_dicts(result, expected) + + +def test_meanh_all(constructor: Constructor) -> None: + data = {"a": [2, 4, 6], "b": [10, 20, 30]} + df = nw.from_native(constructor(data)) + result = df.select(nw.mean_horizontal(nw.all())) + expected = { + "a": [6, 12, 18], + } + compare_dicts(result, expected) + result = df.select(c=nw.mean_horizontal(nw.all())) + expected = { + "c": [6, 12, 18], + } + compare_dicts(result, expected) From c1159d8c9e306660babcc1aa392c6e4e2bfef98a Mon Sep 17 00:00:00 2001 From: Liam Connors Date: Sun, 6 Oct 2024 03:17:40 -0400 Subject: [PATCH 091/145] fix small docs typo (#1141) --- docs/backcompat.md | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/docs/backcompat.md b/docs/backcompat.md index 3c4a19a7c..55f31aadf 100644 --- a/docs/backcompat.md +++ b/docs/backcompat.md @@ -74,8 +74,8 @@ users of `narwhals.stable.v1` will have their code unaffected. Which should you use? In general we recommend: - When prototyping, use `import narwhals as nw`, so you can iterate quickly. -- Once you're happy with what you've got and what to release something production-ready and stable, - when switch out your `import narwhals as nw` usage for `import narwhals.stable.v1 as nw`. +- Once you're happy with what you've got and want to release something production-ready and stable, + then switch out your `import narwhals as nw` usage for `import narwhals.stable.v1 as nw`. 
## Exceptions From 9e1dbb6773464d2899f9da6fd5d0d7a98c441119 Mon Sep 17 00:00:00 2001 From: Liam Connors Date: Sun, 6 Oct 2024 03:18:48 -0400 Subject: [PATCH 092/145] update agg for cuDF (#1138) --- narwhals/_pandas_like/group_by.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/narwhals/_pandas_like/group_by.py b/narwhals/_pandas_like/group_by.py index f20383460..366a52e1e 100644 --- a/narwhals/_pandas_like/group_by.py +++ b/narwhals/_pandas_like/group_by.py @@ -212,7 +212,9 @@ def agg_pandas( # noqa: PLR0915 result_aggs = result_simple_aggs else: # No aggregation provided - result_aggs = native_namespace.DataFrame(grouped.groups.keys(), columns=keys) + result_aggs = native_namespace.DataFrame( + list(grouped.groups.keys()), columns=keys + ) return from_dataframe(result_aggs.loc[:, output_names]) if dataframe_is_empty: From efc6a52265ea52f212718c7b6ababad38ab774fc Mon Sep 17 00:00:00 2001 From: Francesco Bruzzesi <42817048+FBruzzesi@users.noreply.github.com> Date: Sun, 6 Oct 2024 09:24:33 +0200 Subject: [PATCH 093/145] fix: pyarrow unpivot upcast numeric (#1140) * fix: pyarrow unpivot upcast numeric * pin min pyarrow version --- narwhals/_arrow/dataframe.py | 10 +++++++++- tests/frame/unpivot_test.py | 34 ++++++++++++++++++++++++++++++++++ 2 files changed, 43 insertions(+), 1 deletion(-) diff --git a/narwhals/_arrow/dataframe.py b/narwhals/_arrow/dataframe.py index 905ada598..4bab24a28 100644 --- a/narwhals/_arrow/dataframe.py +++ b/narwhals/_arrow/dataframe.py @@ -682,6 +682,11 @@ def unpivot( n_rows = len(self) + promote_kwargs = ( + {"promote_options": "permissive"} + if self._backend_version >= (14, 0, 0) + else {} + ) return self._from_native_frame( pa.concat_tables( [ @@ -694,6 +699,9 @@ def unpivot( names=[*index_, variable_name, value_name], ) for on_col in on_ - ] + ], + **promote_kwargs, ) ) + # TODO(Unassigned): Even with promote_options="permissive", pyarrow does not + # upcast numeric to non-numeric (e.g. 
string) datatypes diff --git a/tests/frame/unpivot_test.py b/tests/frame/unpivot_test.py index 2adad6465..d169d5ca6 100644 --- a/tests/frame/unpivot_test.py +++ b/tests/frame/unpivot_test.py @@ -1,11 +1,19 @@ from __future__ import annotations +from typing import TYPE_CHECKING +from typing import Any + +import pyarrow as pa import pytest import narwhals.stable.v1 as nw +from narwhals.utils import parse_version from tests.utils import Constructor from tests.utils import compare_dicts +if TYPE_CHECKING: + from narwhals.stable.v1.dtypes import DType + data = { "a": ["x", "y", "z"], "b": [1, 3, 5], @@ -70,3 +78,29 @@ def test_unpivot_default_var_value_names(constructor: Constructor) -> None: result = df.unpivot(on=["b", "c"], index=["a"]) assert result.collect_schema().names()[-2:] == ["variable", "value"] + + +@pytest.mark.parametrize( + ("data", "expected_dtypes"), + [ + ( + {"idx": [0, 1], "a": [1, 2], "b": [1.5, 2.5]}, + [nw.Int64(), nw.String(), nw.Float64()], + ), + ], +) +def test_unpivot_mixed_types( + request: pytest.FixtureRequest, + constructor: Constructor, + data: dict[str, Any], + expected_dtypes: list[DType], +) -> None: + if "dask" in str(constructor) or ( + "pyarrow_table" in str(constructor) + and parse_version(pa.__version__) < parse_version("14.0.0") + ): + request.applymarker(pytest.mark.xfail) + df = nw.from_native(constructor(data)) + result = df.unpivot(on=["a", "b"], index="idx") + + assert result.collect_schema().dtypes() == expected_dtypes From 5aa4e122a2b03f38d6cb7292a797a77db5f08330 Mon Sep 17 00:00:00 2001 From: Francesco Bruzzesi <42817048+FBruzzesi@users.noreply.github.com> Date: Sun, 6 Oct 2024 09:25:42 +0200 Subject: [PATCH 094/145] fix: when-then-otherwise lit string for arrow backend (#1137) * fix: when-then-otherwise lit string for arrow backend * rename test --- narwhals/_arrow/namespace.py | 13 +++++++++++-- tests/expr_and_series/when_test.py | 7 +++++++ 2 files changed, 18 insertions(+), 2 deletions(-) diff --git a/narwhals/_arrow/namespace.py b/narwhals/_arrow/namespace.py index 7f0bcbec8..c0e86de61 100644 --- a/narwhals/_arrow/namespace.py +++ b/narwhals/_arrow/namespace.py @@ -11,6 +11,7 @@ from narwhals._arrow.expr import ArrowExpr from narwhals._arrow.selectors import ArrowSelectorNamespace from narwhals._arrow.series import ArrowSeries +from narwhals._arrow.utils import broadcast_series from narwhals._arrow.utils import horizontal_concat from narwhals._arrow.utils import vertical_concat from narwhals._expression_parsing import combine_root_names @@ -353,7 +354,8 @@ def __call__(self, df: ArrowDataFrame) -> list[ArrowSeries]: self._otherwise_value, namespace=plx )._call(df)[0] # type: ignore[arg-type] except TypeError: - # `self._otherwise_value` is a scalar and can't be converted to an expression + # `self._otherwise_value` is a scalar and can't be converted to an expression. + # Remark that string values _are_ converted into expressions! 
return [ value_series._from_native_series( pc.if_else( @@ -364,7 +366,14 @@ def __call__(self, df: ArrowDataFrame) -> list[ArrowSeries]: else: otherwise_series = cast(ArrowSeries, otherwise_series) condition = cast(ArrowSeries, condition) - return [value_series.zip_with(condition, otherwise_series)] + condition_native, otherwise_native = broadcast_series( + [condition, otherwise_series] + ) + return [ + value_series._from_native_series( + pc.if_else(condition_native, value_series_native, otherwise_native) + ) + ] def then(self, value: ArrowExpr | ArrowSeries | Any) -> ArrowThen: self._then_value = value diff --git a/tests/expr_and_series/when_test.py b/tests/expr_and_series/when_test.py index 993988744..6fabaa68b 100644 --- a/tests/expr_and_series/when_test.py +++ b/tests/expr_and_series/when_test.py @@ -138,3 +138,10 @@ def test_when_then_otherwise_into_expr(constructor: Constructor) -> None: result = df.select(nw.when(nw.col("a") > 1).then("c").otherwise("e")) expected = {"c": [7, 5, 6]} compare_dicts(result, expected) + + +def test_when_then_otherwise_lit_str(constructor: Constructor) -> None: + df = nw.from_native(constructor(data)) + result = df.select(nw.when(nw.col("a") > 1).then(nw.col("b")).otherwise(nw.lit("z"))) + expected = {"b": ["z", "b", "c"]} + compare_dicts(result, expected) From 3e0405d860b5bcd999ea2657b3f1a602f9e6c7dc Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Dea=20Mar=C3=ADa=20L=C3=A9on?= Date: Sun, 6 Oct 2024 10:13:18 +0200 Subject: [PATCH 095/145] feat: allow inspecting the inner type / length of nw.Array (#1136) --------- Co-authored-by: Marco Gorelli <33491632+MarcoGorelli@users.noreply.github.com> --- narwhals/_arrow/utils.py | 4 +++- narwhals/_duckdb/dataframe.py | 7 +++++-- narwhals/_pandas_like/utils.py | 5 ++++- narwhals/_polars/utils.py | 9 ++++++++- narwhals/dtypes.py | 30 ++++++++++++++++++++++++++- tests/dtypes_test.py | 37 ++++++++++++++++++++++++++++++++++ tests/frame/schema_test.py | 7 ++++--- 7 files changed, 90 insertions(+), 9 deletions(-) diff --git a/narwhals/_arrow/utils.py b/narwhals/_arrow/utils.py index bb5f3f7ff..e37cb093f 100644 --- a/narwhals/_arrow/utils.py +++ b/narwhals/_arrow/utils.py @@ -60,7 +60,9 @@ def native_to_narwhals_dtype(dtype: Any, dtypes: DTypes) -> DType: if pa.types.is_list(dtype) or pa.types.is_large_list(dtype): return dtypes.List(native_to_narwhals_dtype(dtype.value_type, dtypes)) if pa.types.is_fixed_size_list(dtype): - return dtypes.Array() + return dtypes.Array( + native_to_narwhals_dtype(dtype.value_type, dtypes), dtype.list_size + ) return dtypes.Unknown() # pragma: no cover diff --git a/narwhals/_duckdb/dataframe.py b/narwhals/_duckdb/dataframe.py index 95567283f..555555d4a 100644 --- a/narwhals/_duckdb/dataframe.py +++ b/narwhals/_duckdb/dataframe.py @@ -52,8 +52,11 @@ def map_duckdb_dtype_to_narwhals_dtype(duckdb_dtype: Any, dtypes: DTypes) -> DTy return dtypes.Struct() if match_ := re.match(r"(.*)\[\]$", duckdb_dtype): return dtypes.List(map_duckdb_dtype_to_narwhals_dtype(match_.group(1), dtypes)) - if re.match(r"\w+\[\d+\]", duckdb_dtype): - return dtypes.Array() + if match_ := re.match(r"(\w+)\[(\d+)\]", duckdb_dtype): + return dtypes.Array( + map_duckdb_dtype_to_narwhals_dtype(match_.group(1), dtypes), + int(match_.group(2)), + ) return dtypes.Unknown() diff --git a/narwhals/_pandas_like/utils.py b/narwhals/_pandas_like/utils.py index df87e6499..d7ecc98f2 100644 --- a/narwhals/_pandas_like/utils.py +++ b/narwhals/_pandas_like/utils.py @@ -285,7 +285,10 @@ def native_to_narwhals_dtype(column: Any, dtypes: 
DTypes) -> DType: arrow_native_to_narwhals_dtype(column.dtype.pyarrow_dtype.value_type, dtypes) ) if dtype.startswith("fixed_size_list"): - return dtypes.Array() + return dtypes.Array( + arrow_native_to_narwhals_dtype(column.dtype.pyarrow_dtype.value_type, dtypes), + column.dtype.pyarrow_dtype.list_size, + ) if dtype.startswith("struct"): return dtypes.Struct() if dtype == "object": diff --git a/narwhals/_polars/utils.py b/narwhals/_polars/utils.py index 95b98b17a..d44535cc7 100644 --- a/narwhals/_polars/utils.py +++ b/narwhals/_polars/utils.py @@ -8,6 +8,8 @@ from narwhals.dtypes import DType from narwhals.typing import DTypes +from narwhals.utils import parse_version + def extract_native(obj: Any) -> Any: from narwhals._polars.dataframe import PolarsDataFrame @@ -77,7 +79,12 @@ def native_to_narwhals_dtype(dtype: Any, dtypes: DTypes) -> DType: if dtype == pl.List: return dtypes.List(native_to_narwhals_dtype(dtype.inner, dtypes)) if dtype == pl.Array: - return dtypes.Array() + if parse_version(pl.__version__) < (1, 0): # pragma: no cover + return dtypes.Array( + native_to_narwhals_dtype(dtype.inner, dtypes), dtype.width + ) + else: + return dtypes.Array(native_to_narwhals_dtype(dtype.inner, dtypes), dtype.size) return dtypes.Unknown() diff --git a/narwhals/dtypes.py b/narwhals/dtypes.py index 6acfbb764..98d8c6914 100644 --- a/narwhals/dtypes.py +++ b/narwhals/dtypes.py @@ -200,7 +200,35 @@ def __repr__(self) -> str: return f"{class_name}({self.inner!r})" -class Array(DType): ... +class Array(DType): + def __init__(self, inner: DType | type[DType], width: int | None = None) -> None: + self.inner = inner + if width is None: + error = "`width` must be specified when initializing an `Array`" + raise TypeError(error) + self.width = width + + def __eq__(self, other: DType | type[DType]) -> bool: # type: ignore[override] + # This equality check allows comparison of type classes and type instances. + # If a parent type is not specific about its inner type, we infer it as equal: + # > array[i64] == array[i64] -> True + # > array[i64] == array[f32] -> False + # > array[i64] == array -> True + + # allow comparing object instances to class + if type(other) is type and issubclass(other, self.__class__): + return True + elif isinstance(other, self.__class__): + return self.inner == other.inner + else: + return False + + def __hash__(self) -> int: + return hash((self.__class__, self.inner, self.width)) + + def __repr__(self) -> str: + class_name = self.__class__.__name__ + return f"{class_name}({self.inner!r}, {self.width})" class Date(TemporalType): ... 
diff --git a/tests/dtypes_test.py b/tests/dtypes_test.py
index c64dfed51..c35507873 100644
--- a/tests/dtypes_test.py
+++ b/tests/dtypes_test.py
@@ -7,6 +7,7 @@
 
 import numpy as np
 import pandas as pd
+import polars as pl
 import pyarrow as pa
 import pytest
 
@@ -67,6 +68,42 @@ def test_list_valid() -> None:
     assert dtype in {nw.List(nw.List(nw.Int64))}
 
 
+def test_array_valid() -> None:
+    dtype = nw.Array(nw.Int64, 2)
+    assert dtype == nw.Array(nw.Int64, 2)
+    assert dtype == nw.Array
+    assert dtype != nw.Array(nw.Float32, 2)
+    assert dtype != nw.Duration
+    assert repr(dtype) == "Array(<class 'narwhals.dtypes.Int64'>, 2)"
+    dtype = nw.Array(nw.Array(nw.Int64, 2), 2)
+    assert dtype == nw.Array(nw.Array(nw.Int64, 2), 2)
+    assert dtype == nw.Array
+    assert dtype != nw.Array(nw.Array(nw.Float32, 2), 2)
+    assert dtype in {nw.Array(nw.Array(nw.Int64, 2), 2)}
+
+    with pytest.raises(
+        TypeError, match="`width` must be specified when initializing an `Array`"
+    ):
+        dtype = nw.Array(nw.Int64)
+
+
+@pytest.mark.skipif(
+    parse_version(pl.__version__) < (1,) or parse_version(pd.__version__) < (2, 2),
+    reason="`shape` is only available after 1.0",
+)
+def test_polars_2d_array() -> None:
+    df = pl.DataFrame(
+        {"a": [[[1, 2], [3, 4], [5, 6]]]}, schema={"a": pl.Array(pl.Int64, (3, 2))}
+    )
+    assert nw.from_native(df).collect_schema()["a"] == nw.Array(nw.Array(nw.Int64, 2), 3)
+    assert nw.from_native(df.to_arrow()).collect_schema()["a"] == nw.Array(
+        nw.Array(nw.Int64, 2), 3
+    )
+    assert nw.from_native(
+        df.to_pandas(use_pyarrow_extension_array=True)
+    ).collect_schema()["a"] == nw.Array(nw.Array(nw.Int64, 2), 3)
+
+
 def test_second_time_unit() -> None:
     s = pd.Series(np.array([np.datetime64("2020-01-01", "s")]))
     result = nw.from_native(s, series_only=True)
diff --git a/tests/frame/schema_test.py b/tests/frame/schema_test.py
index b950f5286..cb5ddff19 100644
--- a/tests/frame/schema_test.py
+++ b/tests/frame/schema_test.py
@@ -213,22 +213,23 @@ def test_nested_dtypes() -> None:
         schema_overrides={"b": pl.Array(pl.Int64, 2)},
     ).to_pandas(use_pyarrow_extension_array=True)
     nwdf = nw.from_native(df)
+    assert nwdf.schema == {"a": nw.List, "b": nw.Array, "c": nw.Struct}
     df = pl.DataFrame(
         {"a": [[1, 2]], "b": [[1, 2]], "c": [{"a": 1}]},
         schema_overrides={"b": pl.Array(pl.Int64, 2)},
     )
     nwdf = nw.from_native(df)
-    assert nwdf.schema == {"a": nw.List, "b": nw.Array, "c": nw.Struct}
+    assert nwdf.schema == {"a": nw.List, "b": nw.Array(nw.Int64, 2), "c": nw.Struct}
     df = pl.DataFrame(
         {"a": [[1, 2]], "b": [[1, 2]], "c": [{"a": 1}]},
         schema_overrides={"b": pl.Array(pl.Int64, 2)},
     ).to_arrow()
     nwdf = nw.from_native(df)
-    assert nwdf.schema == {"a": nw.List, "b": nw.Array, "c": nw.Struct}
+    assert nwdf.schema == {"a": nw.List, "b": nw.Array(nw.Int64, 2), "c": nw.Struct}
     df = duckdb.sql("select * from df")
     nwdf = nw.from_native(df)
-    assert nwdf.schema == {"a": nw.List, "b": nw.Array, "c": nw.Struct}
+    assert nwdf.schema == {"a": nw.List, "b": nw.Array(nw.Int64, 2), "c": nw.Struct}
 
 
 def test_nested_dtypes_ibis() -> None:  # pragma: no cover
From 8f2e834ba27e7d565b16a3b0f44ad420e6bfc27f Mon Sep 17 00:00:00 2001
From: Francesco Bruzzesi <42817048+FBruzzesi@users.noreply.github.com>
Date: Sun, 6 Oct 2024 11:20:49 +0200
Subject: [PATCH 096/145] feat: `concat_str` (#1128)

---------

Co-authored-by: Marco Gorelli <33491632+MarcoGorelli@users.noreply.github.com>
---
 docs/api-reference/narwhals.md | 1 +
 narwhals/__init__.py | 2 +
 narwhals/_arrow/namespace.py | 41 ++++++++++
 narwhals/_dask/namespace.py | 49 ++++++++++++
 narwhals/_pandas_like/namespace.py | 55
++++++++++++++++ narwhals/_polars/namespace.py | 59 +++++++++++++++++ narwhals/expr.py | 83 ++++++++++++++++++++++++ narwhals/stable/v1/__init__.py | 80 +++++++++++++++++++++++ tests/expr_and_series/concat_str_test.py | 58 +++++++++++++++++ 9 files changed, 428 insertions(+) create mode 100644 tests/expr_and_series/concat_str_test.py diff --git a/docs/api-reference/narwhals.md b/docs/api-reference/narwhals.md index 2700c48c7..39afb8e8b 100644 --- a/docs/api-reference/narwhals.md +++ b/docs/api-reference/narwhals.md @@ -11,6 +11,7 @@ Here are the top-level functions available in Narwhals. - any_horizontal - col - concat + - concat_str - from_dict - from_native - get_level diff --git a/narwhals/__init__.py b/narwhals/__init__.py index 63dc6acd6..3e367460d 100644 --- a/narwhals/__init__.py +++ b/narwhals/__init__.py @@ -30,6 +30,7 @@ from narwhals.expr import all_horizontal from narwhals.expr import any_horizontal from narwhals.expr import col +from narwhals.expr import concat_str from narwhals.expr import len_ as len from narwhals.expr import lit from narwhals.expr import max @@ -80,6 +81,7 @@ "all_horizontal", "any_horizontal", "col", + "concat_str", "len", "lit", "min", diff --git a/narwhals/_arrow/namespace.py b/narwhals/_arrow/namespace.py index c0e86de61..c514110fc 100644 --- a/narwhals/_arrow/namespace.py +++ b/narwhals/_arrow/namespace.py @@ -298,6 +298,47 @@ def when( return ArrowWhen(condition, self._backend_version, dtypes=self._dtypes) + def concat_str( + self, + exprs: Iterable[IntoArrowExpr], + *more_exprs: IntoArrowExpr, + separator: str = "", + ignore_nulls: bool = False, + ) -> ArrowExpr: + import pyarrow.compute as pc # ignore-banned-import + + parsed_exprs: list[ArrowExpr] = [ + *parse_into_exprs(*exprs, namespace=self), + *parse_into_exprs(*more_exprs, namespace=self), + ] + + def func(df: ArrowDataFrame) -> list[ArrowSeries]: + series = ( + s._native_series + for _expr in parsed_exprs + for s in _expr.cast(self._dtypes.String())._call(df) + ) + null_handling = "skip" if ignore_nulls else "emit_null" + result_series = pc.binary_join_element_wise( + *series, separator, null_handling=null_handling + ) + return [ + ArrowSeries( + native_series=result_series, + name="", + backend_version=self._backend_version, + dtypes=self._dtypes, + ) + ] + + return self._create_expr_from_callable( + func=func, + depth=max(x._depth for x in parsed_exprs) + 1, + function_name="concat_str", + root_names=combine_root_names(parsed_exprs), + output_names=reduce_output_names(parsed_exprs), + ) + class ArrowWhen: def __init__( diff --git a/narwhals/_dask/namespace.py b/narwhals/_dask/namespace.py index 580689c15..e0f2bbbde 100644 --- a/narwhals/_dask/namespace.py +++ b/narwhals/_dask/namespace.py @@ -299,6 +299,55 @@ def when( condition, self._backend_version, returns_scalar=False, dtypes=self._dtypes ) + def concat_str( + self, + exprs: Iterable[IntoDaskExpr], + *more_exprs: IntoDaskExpr, + separator: str = "", + ignore_nulls: bool = False, + ) -> DaskExpr: + parsed_exprs: list[DaskExpr] = [ + *parse_into_exprs(*exprs, namespace=self), + *parse_into_exprs(*more_exprs, namespace=self), + ] + + def func(df: DaskLazyFrame) -> list[dask_expr.Series]: + series = (s.astype(str) for _expr in parsed_exprs for s in _expr._call(df)) + null_mask = [s for _expr in parsed_exprs for s in _expr.is_null()._call(df)] + + if not ignore_nulls: + null_mask_result = reduce(lambda x, y: x | y, null_mask) + result = reduce(lambda x, y: x + separator + y, series).where( + ~null_mask_result, None + ) + else: + init_value, 
*values = [ + s.where(~nm, "") for s, nm in zip(series, null_mask) + ] + + separators = ( + nm.map({True: "", False: separator}, meta=str) + for nm in null_mask[:-1] + ) + result = reduce( + lambda x, y: x + y, + (s + v for s, v in zip(separators, values)), + init_value, + ) + + return [result] + + return DaskExpr( + call=func, + depth=max(x._depth for x in parsed_exprs) + 1, + function_name="concat_str", + root_names=combine_root_names(parsed_exprs), + output_names=reduce_output_names(parsed_exprs), + returns_scalar=False, + backend_version=self._backend_version, + dtypes=self._dtypes, + ) + class DaskWhen: def __init__( diff --git a/narwhals/_pandas_like/namespace.py b/narwhals/_pandas_like/namespace.py index 35fd878b7..555f9efcf 100644 --- a/narwhals/_pandas_like/namespace.py +++ b/narwhals/_pandas_like/namespace.py @@ -326,6 +326,61 @@ def when( condition, self._implementation, self._backend_version, dtypes=self._dtypes ) + def concat_str( + self, + exprs: Iterable[IntoPandasLikeExpr], + *more_exprs: IntoPandasLikeExpr, + separator: str = "", + ignore_nulls: bool = False, + ) -> PandasLikeExpr: + parsed_exprs: list[PandasLikeExpr] = [ + *parse_into_exprs(*exprs, namespace=self), + *parse_into_exprs(*more_exprs, namespace=self), + ] + + def func(df: PandasLikeDataFrame) -> list[PandasLikeSeries]: + series = ( + s + for _expr in parsed_exprs + for s in _expr.cast(self._dtypes.String())._call(df) + ) + null_mask = [s for _expr in parsed_exprs for s in _expr.is_null()._call(df)] + + if not ignore_nulls: + null_mask_result = reduce(lambda x, y: x | y, null_mask) + result = reduce(lambda x, y: x + separator + y, series).zip_with( + ~null_mask_result, None + ) + else: + init_value, *values = [ + s.zip_with(~nm, "") for s, nm in zip(series, null_mask) + ] + + sep_array = init_value.__class__._from_iterable( + data=[separator] * len(init_value), + name="sep", + index=init_value._native_series.index, + implementation=self._implementation, + backend_version=self._backend_version, + dtypes=self._dtypes, + ) + separators = (sep_array.zip_with(~nm, "") for nm in null_mask[:-1]) + result = reduce( + lambda x, y: x + y, + (s + v for s, v in zip(separators, values)), + init_value, + ) + + return [result] + + return self._create_expr_from_callable( + func=func, + depth=max(x._depth for x in parsed_exprs) + 1, + function_name="concat_str", + root_names=combine_root_names(parsed_exprs), + output_names=reduce_output_names(parsed_exprs), + ) + class PandasWhen: def __init__( diff --git a/narwhals/_polars/namespace.py b/narwhals/_polars/namespace.py index c43acbb26..4eb8451b7 100644 --- a/narwhals/_polars/namespace.py +++ b/narwhals/_polars/namespace.py @@ -117,6 +117,65 @@ def mean_horizontal(self, *exprs: IntoPolarsExpr) -> PolarsExpr: dtypes=self._dtypes, ) + def concat_str( + self, + exprs: Iterable[IntoPolarsExpr], + *more_exprs: IntoPolarsExpr, + separator: str = "", + ignore_nulls: bool = False, + ) -> PolarsExpr: + import polars as pl # ignore-banned-import() + + from narwhals._polars.expr import PolarsExpr + + pl_exprs: list[pl.Expr] = [ + expr._native_expr + for expr in ( + *parse_into_exprs(*exprs, namespace=self), + *parse_into_exprs(*more_exprs, namespace=self), + ) + ] + + if self._backend_version < (0, 20, 6): # pragma: no cover + null_mask = [expr.is_null() for expr in pl_exprs] + sep = pl.lit(separator) + + if not ignore_nulls: + null_mask_result = pl.any_horizontal(*null_mask) + output_expr = pl.reduce( + lambda x, y: x.cast(pl.String()) + sep + y.cast(pl.String()), # type: 
ignore[arg-type,return-value] + pl_exprs, + ) + result = pl.when(~null_mask_result).then(output_expr) + else: + init_value, *values = [ + pl.when(nm).then(pl.lit("")).otherwise(expr.cast(pl.String())) + for expr, nm in zip(pl_exprs, null_mask) + ] + separators = [ + pl.when(~nm).then(sep).otherwise(pl.lit("")) for nm in null_mask[:-1] + ] + + result = pl.fold( # type: ignore[assignment] + acc=init_value, + function=lambda x, y: x + y, + exprs=[s + v for s, v in zip(separators, values)], + ) + + return PolarsExpr( + result, + dtypes=self._dtypes, + ) + + return PolarsExpr( + pl.concat_str( + pl_exprs, + separator=separator, + ignore_nulls=ignore_nulls, + ), + dtypes=self._dtypes, + ) + @property def selectors(self) -> PolarsSelectors: return PolarsSelectors(self._dtypes) diff --git a/narwhals/expr.py b/narwhals/expr.py index b43fbde00..59e1ff76c 100644 --- a/narwhals/expr.py +++ b/narwhals/expr.py @@ -4411,6 +4411,89 @@ def mean_horizontal(*exprs: IntoExpr | Iterable[IntoExpr]) -> Expr: ) +def concat_str( + exprs: IntoExpr | Iterable[IntoExpr], + *more_exprs: IntoExpr, + separator: str = "", + ignore_nulls: bool = False, +) -> Expr: + r""" + Horizontally concatenate columns into a single string column. + + Arguments: + exprs: Columns to concatenate into a single string column. Accepts expression + input. Strings are parsed as column names, other non-expression inputs are + parsed as literals. Non-`String` columns are cast to `String`. + *more_exprs: Additional columns to concatenate into a single string column, + specified as positional arguments. + separator: String that will be used to separate the values of each column. + ignore_nulls: Ignore null values (default is `False`). + If set to `False`, null values will be propagated and if the row contains any + null values, the output is null. + + Examples: + >>> import narwhals as nw + >>> import pandas as pd + >>> import polars as pl + >>> import pyarrow as pa + >>> data = { + ... "a": [1, 2, 3], + ... "b": ["dogs", "cats", None], + ... "c": ["play", "swim", "walk"], + ... } + + We define a dataframe-agnostic function that computes the horizontal string + concatenation of different columns + + >>> @nw.narwhalify + ... def func(df): + ... return df.select( + ... nw.concat_str( + ... [ + ... nw.col("a") * 2, + ... nw.col("b"), + ... nw.col("c"), + ... ], + ... separator=" ", + ... ).alias("full_sentence") + ... 
) + + We can then pass either pandas, Polars or PyArrow to `func`: + + >>> func(pd.DataFrame(data)) + full_sentence + 0 2 dogs play + 1 4 cats swim + 2 None + + >>> func(pl.DataFrame(data)) + shape: (3, 1) + ┌───────────────┐ + │ full_sentence │ + │ --- │ + │ str │ + ╞═══════════════╡ + │ 2 dogs play │ + │ 4 cats swim │ + │ null │ + └───────────────┘ + + >>> func(pa.table(data)) + pyarrow.Table + full_sentence: string + ---- + full_sentence: [["2 dogs play","4 cats swim",null]] + """ + return Expr( + lambda plx: plx.concat_str( + [extract_compliant(plx, v) for v in flatten([exprs])], + *[extract_compliant(plx, v) for v in more_exprs], + separator=separator, + ignore_nulls=ignore_nulls, + ) + ) + + __all__ = [ "Expr", ] diff --git a/narwhals/stable/v1/__init__.py b/narwhals/stable/v1/__init__.py index 84b94aebf..1c3a3bd1f 100644 --- a/narwhals/stable/v1/__init__.py +++ b/narwhals/stable/v1/__init__.py @@ -1649,6 +1649,85 @@ def concat( return _stableify(nw.concat(items, how=how)) # type: ignore[no-any-return] +def concat_str( + exprs: IntoExpr | Iterable[IntoExpr], + *more_exprs: IntoExpr, + separator: str = "", + ignore_nulls: bool = False, +) -> Expr: + r""" + Horizontally concatenate columns into a single string column. + + Arguments: + exprs: Columns to concatenate into a single string column. Accepts expression + input. Strings are parsed as column names, other non-expression inputs are + parsed as literals. Non-`String` columns are cast to `String`. + *more_exprs: Additional columns to concatenate into a single string column, + specified as positional arguments. + separator: String that will be used to separate the values of each column. + ignore_nulls: Ignore null values (default is `False`). + If set to `False`, null values will be propagated and if the row contains any + null values, the output is null. + + Examples: + >>> import narwhals.stable.v1 as nw + >>> import pandas as pd + >>> import polars as pl + >>> import pyarrow as pa + >>> data = { + ... "a": [1, 2, 3], + ... "b": ["dogs", "cats", None], + ... "c": ["play", "swim", "walk"], + ... } + + We define a dataframe-agnostic function that computes the horizontal string + concatenation of different columns + + >>> @nw.narwhalify + ... def func(df): + ... return df.select( + ... nw.concat_str( + ... [ + ... nw.col("a") * 2, + ... nw.col("b"), + ... nw.col("c"), + ... ], + ... separator=" ", + ... ).alias("full_sentence") + ... ) + + We can then pass either pandas, Polars or PyArrow to `func`: + + >>> func(pd.DataFrame(data)) + full_sentence + 0 2 dogs play + 1 4 cats swim + 2 None + + >>> func(pl.DataFrame(data)) + shape: (3, 1) + ┌───────────────┐ + │ full_sentence │ + │ --- │ + │ str │ + ╞═══════════════╡ + │ 2 dogs play │ + │ 4 cats swim │ + │ null │ + └───────────────┘ + + >>> func(pa.table(data)) + pyarrow.Table + full_sentence: string + ---- + full_sentence: [["2 dogs play","4 cats swim",null]] + """ + + return _stableify( + nw.concat_str(exprs, *more_exprs, separator=separator, ignore_nulls=ignore_nulls) + ) + + def is_ordered_categorical(series: Series) -> bool: """ Return whether indices of categories are semantically meaningful. 
@@ -2070,6 +2149,7 @@ def from_dict( "all_horizontal", "any_horizontal", "col", + "concat_str", "nth", "len", "lit", diff --git a/tests/expr_and_series/concat_str_test.py b/tests/expr_and_series/concat_str_test.py new file mode 100644 index 000000000..5a28085a8 --- /dev/null +++ b/tests/expr_and_series/concat_str_test.py @@ -0,0 +1,58 @@ +from __future__ import annotations + +import pytest + +import narwhals.stable.v1 as nw +from tests.utils import Constructor +from tests.utils import compare_dicts + +data = { + "a": [1, 2, 3], + "b": ["dogs", "cats", None], + "c": ["play", "swim", "walk"], +} + + +@pytest.mark.parametrize( + ("ignore_nulls", "expected"), + [ + (True, ["2 dogs play", "4 cats swim", "6 walk"]), + (False, ["2 dogs play", "4 cats swim", None]), + ], +) +def test_concat_str( + constructor: Constructor, *, ignore_nulls: bool, expected: list[str] +) -> None: + df = nw.from_native(constructor(data)) + result = ( + df.select( + "a", + nw.concat_str( + [ + nw.col("a") * 2, + nw.col("b"), + nw.col("c"), + ], + separator=" ", + ignore_nulls=ignore_nulls, # default behavior is False + ).alias("full_sentence"), + ) + .sort("a") + .select("full_sentence") + ) + compare_dicts(result, {"full_sentence": expected}) + result = ( + df.select( + "a", + nw.concat_str( + nw.col("a") * 2, + nw.col("b"), + nw.col("c"), + separator=" ", + ignore_nulls=ignore_nulls, # default behavior is False + ).alias("full_sentence"), + ) + .sort("a") + .select("full_sentence") + ) + compare_dicts(result, {"full_sentence": expected}) From a2ef4fe29ec61ed3f9218e6064f07aa240776a40 Mon Sep 17 00:00:00 2001 From: Alexander Ivanov <54221777+aivanoved@users.noreply.github.com> Date: Sun, 6 Oct 2024 11:47:34 +0200 Subject: [PATCH 097/145] perf: Impove `pandas`-backend indexing (#993) --------- Co-authored-by: Marco Gorelli <33491632+MarcoGorelli@users.noreply.github.com> --- narwhals/_pandas_like/dataframe.py | 10 +++++----- narwhals/_pandas_like/expr.py | 2 +- narwhals/_pandas_like/namespace.py | 2 +- 3 files changed, 7 insertions(+), 7 deletions(-) diff --git a/narwhals/_pandas_like/dataframe.py b/narwhals/_pandas_like/dataframe.py index b9788b533..6e96fb7ce 100644 --- a/narwhals/_pandas_like/dataframe.py +++ b/narwhals/_pandas_like/dataframe.py @@ -104,7 +104,7 @@ def get_column(self, name: str) -> PandasLikeSeries: from narwhals._pandas_like.series import PandasLikeSeries return PandasLikeSeries( - self._native_frame.loc[:, name], + self._native_frame[name], implementation=self._implementation, backend_version=self._backend_version, dtypes=self._dtypes, @@ -157,7 +157,7 @@ def __getitem__( from narwhals._pandas_like.series import PandasLikeSeries return PandasLikeSeries( - self._native_frame.loc[:, item], + self._native_frame[item], implementation=self._implementation, backend_version=self._backend_version, dtypes=self._dtypes, @@ -276,7 +276,7 @@ def iter_rows( @property def schema(self) -> dict[str, DType]: return { - col: native_to_narwhals_dtype(self._native_frame.loc[:, col], self._dtypes) + col: native_to_narwhals_dtype(self._native_frame[col], self._dtypes) for col in self._native_frame.columns } @@ -382,7 +382,7 @@ def with_columns( ) ) else: - to_concat.append(self._native_frame.loc[:, name]) + to_concat.append(self._native_frame[name]) to_concat.extend( validate_dataframe_comparand(index, new_column_name_to_new_column_map[s]) for s in new_column_name_to_new_column_map @@ -637,7 +637,7 @@ def to_dict(self, *, as_series: bool = False) -> dict[str, Any]: # TODO(Unassigned): should this return narwhals series? 
return { col: PandasLikeSeries( - self._native_frame.loc[:, col], + self._native_frame[col], implementation=self._implementation, backend_version=self._backend_version, dtypes=self._dtypes, diff --git a/narwhals/_pandas_like/expr.py b/narwhals/_pandas_like/expr.py index 52c237aaa..4e3011446 100644 --- a/narwhals/_pandas_like/expr.py +++ b/narwhals/_pandas_like/expr.py @@ -70,7 +70,7 @@ def from_column_names( def func(df: PandasLikeDataFrame) -> list[PandasLikeSeries]: return [ PandasLikeSeries( - df._native_frame.loc[:, column_name], + df._native_frame[column_name], implementation=df._implementation, backend_version=df._backend_version, dtypes=df._dtypes, diff --git a/narwhals/_pandas_like/namespace.py b/narwhals/_pandas_like/namespace.py index 555f9efcf..8f6e17518 100644 --- a/narwhals/_pandas_like/namespace.py +++ b/narwhals/_pandas_like/namespace.py @@ -119,7 +119,7 @@ def all(self) -> PandasLikeExpr: return PandasLikeExpr( lambda df: [ PandasLikeSeries( - df._native_frame.loc[:, column_name], + df._native_frame[column_name], implementation=self._implementation, backend_version=self._backend_version, dtypes=self._dtypes, From f9a5c607d76d8891565c98f5f24bc1514f4fc188 Mon Sep 17 00:00:00 2001 From: Alessandro Miola <37796412+AlessandroMiola@users.noreply.github.com> Date: Sun, 6 Oct 2024 15:44:45 +0200 Subject: [PATCH 098/145] docs: add missing Series str methods in api reference (#1143) * docs: add to_lowercase and to_uppercase to Series.str docs * chore: add Series.str methods to check on api-reference docs --- docs/api-reference/series_str.md | 2 ++ utils/check_api_reference.py | 26 +++++++++++++++++++++++++- 2 files changed, 27 insertions(+), 1 deletion(-) diff --git a/docs/api-reference/series_str.md b/docs/api-reference/series_str.md index bd737d8b7..9ae01890d 100644 --- a/docs/api-reference/series_str.md +++ b/docs/api-reference/series_str.md @@ -14,6 +14,8 @@ - starts_with - strip_chars - to_datetime + - to_lowercase + - to_uppercase - tail show_source: false show_bases: false diff --git a/utils/check_api_reference.py b/utils/check_api_reference.py index d3f30aaa2..60e968a85 100644 --- a/utils/check_api_reference.py +++ b/utils/check_api_reference.py @@ -159,7 +159,31 @@ # dt -# str +# Series.str methods +series_str_methods = [ + i + for i in nw.from_native(pl.Series(), series_only=True).str.__dir__() + if not i[0].isupper() and i[0] != "_" +] + +with open("docs/api-reference/series_str.md") as fd: + content = fd.read() + +documented = [ + remove_prefix(i, " - ") + for i in content.splitlines() + if i.startswith(" - ") and not i.startswith(" - _") +] + +if missing := set(series_str_methods).difference(documented): + print("Series.str: not documented") # noqa: T201 + print(missing) # noqa: T201 + ret = 1 + +if extra := set(documented).difference(series_str_methods): + print("Series.str: outdated") # noqa: T201 + print(extra) # noqa: T201 + ret = 1 # Check Expr vs Series expr = [i for i in nw.Expr(lambda: 0).__dir__() if not i[0].isupper() and i[0] != "_"] From a2c1a10e1d7220c55288a067a6ac9f19b9cf0104 Mon Sep 17 00:00:00 2001 From: Alessandro Miola <37796412+AlessandroMiola@users.noreply.github.com> Date: Mon, 7 Oct 2024 18:49:22 +0200 Subject: [PATCH 099/145] docs: add namespace methods in `check_api_reference.py` (#1144) * docs: add check on Series.dt methods being in api-reference * docs: add check on Series.cat methods being in api-reference * chore: remove extra blanks * docs: add check on documented, yet no longer existent methods on dtype * docs: add check on Expr.{cat, dt, 
name, str} methods being in api-reference * chore: remove useless check and move checks on Series.{cat, dt, str} higher up * docs: apply suggested changes --- utils/check_api_reference.py | 76 ++++++++++++++++++++++++------------ 1 file changed, 50 insertions(+), 26 deletions(-) diff --git a/utils/check_api_reference.py b/utils/check_api_reference.py index 60e968a85..7bc590423 100644 --- a/utils/check_api_reference.py +++ b/utils/check_api_reference.py @@ -121,6 +121,31 @@ print(extra) # noqa: T201 ret = 1 +# Series.{cat, dt, str} methods +for namespace in NAMESPACES.difference({"name"}): + series_methods = [ + i + for i in getattr( + nw.from_native(pl.Series(), series_only=True), namespace + ).__dir__() + if not i[0].isupper() and i[0] != "_" + ] + with open(f"docs/api-reference/series_{namespace}.md") as fd: + content = fd.read() + documented = [ + remove_prefix(i, " - ") + for i in content.splitlines() + if i.startswith(" - ") and not i.startswith(" - _") + ] + if missing := set(series_methods).difference(documented): + print(f"Series.{namespace}: not documented") # noqa: T201 + print(missing) # noqa: T201 + ret = 1 + if extra := set(documented).difference(series_methods): + print(f"Series.{namespace}: outdated") # noqa: T201 + print(extra) # noqa: T201 + ret = 1 + # Expr methods expr_methods = [ i for i in nw.Expr(lambda: 0).__dir__() if not i[0].isupper() and i[0] != "_" @@ -141,6 +166,29 @@ print(extra) # noqa: T201 ret = 1 +# Expr.{cat, dt, name, str} methods +for namespace in NAMESPACES: + expr_methods = [ + i + for i in getattr(nw.Expr(lambda: 0), namespace).__dir__() + if not i[0].isupper() and i[0] != "_" + ] + with open(f"docs/api-reference/expr_{namespace}.md") as fd: + content = fd.read() + documented = [ + remove_prefix(i, " - ") + for i in content.splitlines() + if i.startswith(" - ") + ] + if missing := set(expr_methods).difference(documented): + print(f"Expr.{namespace}: not documented") # noqa: T201 + print(missing) # noqa: T201 + ret = 1 + if extra := set(documented).difference(expr_methods): + print(f"Expr.{namespace}: outdated") # noqa: T201 + print(extra) # noqa: T201 + ret = 1 + # DTypes dtypes = [ i for i in nw.dtypes.__dir__() if i[0].isupper() and not i.isupper() and i[0] != "_" @@ -156,32 +204,8 @@ print("Dtype: not documented") # noqa: T201 print(missing) # noqa: T201 ret = 1 - -# dt - -# Series.str methods -series_str_methods = [ - i - for i in nw.from_native(pl.Series(), series_only=True).str.__dir__() - if not i[0].isupper() and i[0] != "_" -] - -with open("docs/api-reference/series_str.md") as fd: - content = fd.read() - -documented = [ - remove_prefix(i, " - ") - for i in content.splitlines() - if i.startswith(" - ") and not i.startswith(" - _") -] - -if missing := set(series_str_methods).difference(documented): - print("Series.str: not documented") # noqa: T201 - print(missing) # noqa: T201 - ret = 1 - -if extra := set(documented).difference(series_str_methods): - print("Series.str: outdated") # noqa: T201 +if extra := set(documented).difference(dtypes): + print("Dtype: outdated") # noqa: T201 print(extra) # noqa: T201 ret = 1 From 0bcb9a9c8ebce5c9b8010428bbb2dbb50656e327 Mon Sep 17 00:00:00 2001 From: Francesco Bruzzesi <42817048+FBruzzesi@users.noreply.github.com> Date: Mon, 7 Oct 2024 21:17:18 +0200 Subject: [PATCH 100/145] feat: allow `format=None` in `str.to_datetime` (#1145) * WIP * allow str.to_datetime with format=None --- narwhals/_arrow/expr.py | 2 +- narwhals/_arrow/series.py | 6 ++- narwhals/_dask/expr.py | 2 +- narwhals/_pandas_like/expr.py | 
2 +- narwhals/_pandas_like/series.py | 2 +- narwhals/expr.py | 11 +++-- narwhals/series.py | 11 +++-- tests/expr_and_series/str/to_datetime_test.py | 40 +++++++++++++++++++ 8 files changed, 63 insertions(+), 13 deletions(-) diff --git a/narwhals/_arrow/expr.py b/narwhals/_arrow/expr.py index 6d1001c11..c70425efe 100644 --- a/narwhals/_arrow/expr.py +++ b/narwhals/_arrow/expr.py @@ -540,7 +540,7 @@ def slice(self, offset: int, length: int | None = None) -> ArrowExpr: self._expr, "str", "slice", offset, length ) - def to_datetime(self, format: str | None = None) -> ArrowExpr: # noqa: A002 + def to_datetime(self: Self, format: str | None) -> ArrowExpr: # noqa: A002 return reuse_series_namespace_implementation( self._expr, "str", diff --git a/narwhals/_arrow/series.py b/narwhals/_arrow/series.py index 183cf37b7..507023498 100644 --- a/narwhals/_arrow/series.py +++ b/narwhals/_arrow/series.py @@ -1033,9 +1033,13 @@ def slice(self: Self, offset: int, length: int | None = None) -> ArrowSeries: ), ) - def to_datetime(self: Self, format: str | None = None) -> ArrowSeries: # noqa: A002 + def to_datetime(self: Self, format: str | None) -> ArrowSeries: # noqa: A002 import pyarrow.compute as pc # ignore-banned-import() + if format is None: + msg = "`format` is required for pyarrow backend." + raise ValueError(msg) + return self._arrow_series._from_native_series( pc.strptime(self._arrow_series._native_series, format=format, unit="us") ) diff --git a/narwhals/_dask/expr.py b/narwhals/_dask/expr.py index d8d86692e..10b95bc89 100644 --- a/narwhals/_dask/expr.py +++ b/narwhals/_dask/expr.py @@ -811,7 +811,7 @@ def slice(self, offset: int, length: int | None = None) -> DaskExpr: returns_scalar=False, ) - def to_datetime(self, format: str | None = None) -> DaskExpr: # noqa: A002 + def to_datetime(self: Self, format: str | None) -> DaskExpr: # noqa: A002 import dask.dataframe as dd # ignore-banned-import() return self._expr._from_call( diff --git a/narwhals/_pandas_like/expr.py b/narwhals/_pandas_like/expr.py index 4e3011446..2ebadbe16 100644 --- a/narwhals/_pandas_like/expr.py +++ b/narwhals/_pandas_like/expr.py @@ -486,7 +486,7 @@ def slice(self, offset: int, length: int | None = None) -> PandasLikeExpr: self._expr, "str", "slice", offset, length ) - def to_datetime(self, format: str | None = None) -> PandasLikeExpr: # noqa: A002 + def to_datetime(self: Self, format: str | None) -> PandasLikeExpr: # noqa: A002 return reuse_series_namespace_implementation( self._expr, "str", diff --git a/narwhals/_pandas_like/series.py b/narwhals/_pandas_like/series.py index 74e1c492d..9cca66405 100644 --- a/narwhals/_pandas_like/series.py +++ b/narwhals/_pandas_like/series.py @@ -757,7 +757,7 @@ def slice(self, offset: int, length: int | None = None) -> PandasLikeSeries: self._pandas_series._native_series.str.slice(start=offset, stop=stop), ) - def to_datetime(self, format: str | None = None) -> PandasLikeSeries: # noqa: A002 + def to_datetime(self: Self, format: str | None) -> PandasLikeSeries: # noqa: A002 return self._pandas_series._from_native_series( to_datetime(self._pandas_series._implementation)( self._pandas_series._native_series, format=format diff --git a/narwhals/expr.py b/narwhals/expr.py index 59e1ff76c..8446d81c3 100644 --- a/narwhals/expr.py +++ b/narwhals/expr.py @@ -2498,7 +2498,7 @@ def tail(self, n: int = 5) -> Expr: """ return self._expr.__class__(lambda plx: self._expr._call(plx).str.slice(-n)) - def to_datetime(self, format: str) -> Expr: # noqa: A002 + def to_datetime(self: Self, format: str | None = 
None) -> Expr: # noqa: A002 """ Convert to Datetime dtype. @@ -2508,10 +2508,13 @@ def to_datetime(self, format: str) -> Expr: # noqa: A002 in pandas, with no ability to set any other one. The ability to set the time unit in pandas, if the version permits, will arrive. + Warning: + As different backends auto-infer format in different ways, if `format=None` + there is no guarantee that the result will be equal. + Arguments: - format: Format to parse strings with. Must be passed, as different - dataframe libraries have different ways of auto-inferring - formats. + format: Format to use for conversion. If set to None (default), the format is + inferred from the data. Examples: >>> import pandas as pd diff --git a/narwhals/series.py b/narwhals/series.py index bb9709068..1753598c1 100644 --- a/narwhals/series.py +++ b/narwhals/series.py @@ -3083,7 +3083,7 @@ def to_lowercase(self) -> Series: self._narwhals_series._compliant_series.str.to_lowercase() ) - def to_datetime(self, format: str) -> Series: # noqa: A002 + def to_datetime(self: Self, format: str | None = None) -> Series: # noqa: A002 """ Parse Series with strings to a Series with Datetime dtype. @@ -3093,10 +3093,13 @@ def to_datetime(self, format: str) -> Series: # noqa: A002 in pandas, with no ability to set any other one. The ability to set the time unit in pandas, if the version permits, will arrive. + Warning: + As different backends auto-infer format in different ways, if `format=None` + there is no guarantee that the result will be equal. + Arguments: - format: Format to parse strings with. Must be passed, as different - dataframe libraries have different ways of auto-inferring - formats. + format: Format to use for conversion. If set to None (default), the format is + inferred from the data. 
Examples: >>> import pandas as pd diff --git a/tests/expr_and_series/str/to_datetime_test.py b/tests/expr_and_series/str/to_datetime_test.py index a64a3c58b..8474357e0 100644 --- a/tests/expr_and_series/str/to_datetime_test.py +++ b/tests/expr_and_series/str/to_datetime_test.py @@ -1,5 +1,7 @@ from typing import Any +import pytest + import narwhals.stable.v1 as nw from tests.utils import Constructor @@ -34,3 +36,41 @@ def test_to_datetime_series(constructor_eager: Any) -> None: ) ).item(0) assert str(result) == expected + + +def test_to_datetime_infer_fmt( + request: pytest.FixtureRequest, constructor: Constructor +) -> None: + if "pyarrow_table" in str(constructor): + request.applymarker(pytest.mark.xfail) + + if "cudf" in str(constructor): # pragma: no cover + expected = "2020-01-01T12:34:56.000000000" + else: + expected = "2020-01-01 12:34:56" + + result = ( + nw.from_native(constructor(data)) + .lazy() + .select(b=nw.col("a").str.to_datetime()) + .collect() + .item(row=0, column="b") + ) + assert str(result) == expected + + +def test_to_datetime_series_infer_fmt( + request: pytest.FixtureRequest, constructor_eager: Any +) -> None: + if "pyarrow_table" in str(constructor_eager): + request.applymarker(pytest.mark.xfail) + + if "cudf" in str(constructor_eager): # pragma: no cover + expected = "2020-01-01T12:34:56.000000000" + else: + expected = "2020-01-01 12:34:56" + + result = ( + nw.from_native(constructor_eager(data), eager_only=True)["a"].str.to_datetime() + ).item(0) + assert str(result) == expected From f2b7a4094f9a5baffa92347959a90e17d7a132dc Mon Sep 17 00:00:00 2001 From: Alessandro Miola <37796412+AlessandroMiola@users.noreply.github.com> Date: Mon, 7 Oct 2024 21:23:53 +0200 Subject: [PATCH 101/145] docs: compare series vs expr namespace methods in `check_api_reference.py` (#1150) * docs: restore same order in docs between Expr.str and Series.str methods * docs: apply suggestion --- docs/api-reference/expr_str.md | 2 +- docs/api-reference/series_str.md | 2 +- utils/check_api_reference.py | 28 +++++++++++++++++++++++++--- 3 files changed, 27 insertions(+), 5 deletions(-) diff --git a/docs/api-reference/expr_str.md b/docs/api-reference/expr_str.md index 15d66ac99..01966a87e 100644 --- a/docs/api-reference/expr_str.md +++ b/docs/api-reference/expr_str.md @@ -8,9 +8,9 @@ - ends_with - head - len_chars - - slice - replace - replace_all + - slice - starts_with - strip_chars - tail diff --git a/docs/api-reference/series_str.md b/docs/api-reference/series_str.md index 9ae01890d..d4ca36027 100644 --- a/docs/api-reference/series_str.md +++ b/docs/api-reference/series_str.md @@ -13,9 +13,9 @@ - slice - starts_with - strip_chars + - tail - to_datetime - to_lowercase - to_uppercase - - tail show_source: false show_bases: false diff --git a/utils/check_api_reference.py b/utils/check_api_reference.py index 7bc590423..69c310439 100644 --- a/utils/check_api_reference.py +++ b/utils/check_api_reference.py @@ -216,14 +216,36 @@ for i in nw.from_native(pl.Series(), series_only=True).__dir__() if not i[0].isupper() and i[0] != "_" ] - if missing := set(expr).difference(series).difference(EXPR_ONLY_METHODS): - print("In expr but not in series") # noqa: T201 + print("In Expr but not in Series") # noqa: T201 print(missing) # noqa: T201 ret = 1 if extra := set(series).difference(expr).difference(SERIES_ONLY_METHODS): - print("in series but not in expr") # noqa: T201 + print("In Series but not in Expr") # noqa: T201 print(extra) # noqa: T201 ret = 1 +# Check Expr vs Series internal methods +for namespace 
in NAMESPACES.difference({"name"}): + expr_internal = [ + i + for i in getattr(nw.Expr(lambda: 0), namespace).__dir__() + if not i[0].isupper() and i[0] != "_" + ] + series_internal = [ + i + for i in getattr( + nw.from_native(pl.Series(), series_only=True), namespace + ).__dir__() + if not i[0].isupper() and i[0] != "_" + ] + if missing := set(expr_internal).difference(series_internal): + print(f"In Expr.{namespace} but not in Series.{namespace}") # noqa: T201 + print(missing) # noqa: T201 + ret = 1 + if extra := set(series_internal).difference(expr_internal): + print(f"In Series.{namespace} but not in Expr.{namespace}") # noqa: T201 + print(extra) # noqa: T201 + ret = 1 + sys.exit(ret) From df4b372668aa21a6e282151e0f991e08bd9f7d7c Mon Sep 17 00:00:00 2001 From: Liam Connors Date: Tue, 8 Oct 2024 02:49:51 -0400 Subject: [PATCH 102/145] test: xfail `test_unpivot_mixed_types` for cuDF (#1153) --- tests/frame/unpivot_test.py | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) diff --git a/tests/frame/unpivot_test.py b/tests/frame/unpivot_test.py index d169d5ca6..33f7eaca0 100644 --- a/tests/frame/unpivot_test.py +++ b/tests/frame/unpivot_test.py @@ -95,9 +95,13 @@ def test_unpivot_mixed_types( data: dict[str, Any], expected_dtypes: list[DType], ) -> None: - if "dask" in str(constructor) or ( - "pyarrow_table" in str(constructor) - and parse_version(pa.__version__) < parse_version("14.0.0") + if ( + "dask" in str(constructor) + or "cudf" in str(constructor) + or ( + "pyarrow_table" in str(constructor) + and parse_version(pa.__version__) < parse_version("14.0.0") + ) ): request.applymarker(pytest.mark.xfail) df = nw.from_native(constructor(data)) From b961d4eb982b945fcd8efb84f896c275086296c0 Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Tue, 8 Oct 2024 10:32:59 +0200 Subject: [PATCH 103/145] [pre-commit.ci] pre-commit autoupdate (#1152) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit updates: - [github.com/astral-sh/ruff-pre-commit: v0.6.7 → v0.6.9](https://github.com/astral-sh/ruff-pre-commit/compare/v0.6.7...v0.6.9) - [github.com/pre-commit/pre-commit-hooks: v4.6.0 → v5.0.0](https://github.com/pre-commit/pre-commit-hooks/compare/v4.6.0...v5.0.0) Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com> --- .pre-commit-config.yaml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index a9f01f566..89d05e542 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -1,7 +1,7 @@ repos: - repo: https://github.com/astral-sh/ruff-pre-commit # Ruff version. - rev: 'v0.6.7' + rev: 'v0.6.9' hooks: # Run the formatter. 
- id: ruff-format @@ -47,7 +47,7 @@ repos: additional_dependencies: - black==22.12.0 - repo: https://github.com/pre-commit/pre-commit-hooks - rev: v4.6.0 + rev: v5.0.0 hooks: - id: name-tests-test exclude: ^tests/utils\.py \ No newline at end of file From ee5af31b4143a418919347c3a13667a6ca8688f2 Mon Sep 17 00:00:00 2001 From: Marco Edward Gorelli Date: Wed, 9 Oct 2024 09:23:56 +0200 Subject: [PATCH 104/145] feat: add dt.replace_time_zone (#1142) --- .github/workflows/pytest.yml | 1 + docs/api-reference/expr_dt.md | 2 + docs/api-reference/series_dt.md | 2 + narwhals/_arrow/expr.py | 10 ++ narwhals/_arrow/series.py | 25 ++++ narwhals/_dask/expr.py | 28 ++++ narwhals/_pandas_like/expr.py | 10 ++ narwhals/_pandas_like/series.py | 18 +++ narwhals/_pandas_like/utils.py | 20 +-- narwhals/dataframe.py | 3 +- narwhals/expr.py | 113 ++++++++++++++++ narwhals/series.py | 109 +++++++++++++++ .../expr_and_series/convert_time_zone_test.py | 120 +++++++++++++++++ .../expr_and_series/replace_time_zone_test.py | 125 ++++++++++++++++++ 14 files changed, 577 insertions(+), 9 deletions(-) create mode 100644 tests/expr_and_series/convert_time_zone_test.py create mode 100644 tests/expr_and_series/replace_time_zone_test.py diff --git a/.github/workflows/pytest.yml b/.github/workflows/pytest.yml index bf4f7d39a..20058a435 100644 --- a/.github/workflows/pytest.yml +++ b/.github/workflows/pytest.yml @@ -31,6 +31,7 @@ jobs: - name: Run pytest run: pytest tests --cov=narwhals --cov=tests --cov-fail-under=85 - name: Run doctests + if: startsWith(matrix.os, 'windows') != true run: pytest narwhals --doctest-modules pytest-windows: diff --git a/docs/api-reference/expr_dt.md b/docs/api-reference/expr_dt.md index e04e92889..5c9ab41f3 100644 --- a/docs/api-reference/expr_dt.md +++ b/docs/api-reference/expr_dt.md @@ -4,6 +4,7 @@ handler: python options: members: + - convert_time_zone - date - year - month @@ -15,6 +16,7 @@ - millisecond - microsecond - nanosecond + - replace_time_zone - total_minutes - total_seconds - total_milliseconds diff --git a/docs/api-reference/series_dt.md b/docs/api-reference/series_dt.md index ba342ad30..c92592411 100644 --- a/docs/api-reference/series_dt.md +++ b/docs/api-reference/series_dt.md @@ -4,6 +4,7 @@ handler: python options: members: + - convert_time_zone - date - year - month @@ -15,6 +16,7 @@ - millisecond - microsecond - nanosecond + - replace_time_zone - total_minutes - total_seconds - total_milliseconds diff --git a/narwhals/_arrow/expr.py b/narwhals/_arrow/expr.py index c70425efe..55c529d30 100644 --- a/narwhals/_arrow/expr.py +++ b/narwhals/_arrow/expr.py @@ -410,6 +410,16 @@ def to_string(self: Self, format: str) -> ArrowExpr: # noqa: A002 self._expr, "dt", "to_string", format ) + def replace_time_zone(self: Self, time_zone: str | None) -> ArrowExpr: + return reuse_series_namespace_implementation( + self._expr, "dt", "replace_time_zone", time_zone + ) + + def convert_time_zone(self: Self, time_zone: str) -> ArrowExpr: + return reuse_series_namespace_implementation( + self._expr, "dt", "convert_time_zone", time_zone + ) + def date(self: Self) -> ArrowExpr: return reuse_series_namespace_implementation(self._expr, "dt", "date") diff --git a/narwhals/_arrow/series.py b/narwhals/_arrow/series.py index 507023498..65a393ca9 100644 --- a/narwhals/_arrow/series.py +++ b/narwhals/_arrow/series.py @@ -755,6 +755,31 @@ def to_string(self: Self, format: str) -> ArrowSeries: # noqa: A002 pc.strftime(self._arrow_series._native_series, format) ) + def replace_time_zone(self: Self, time_zone: str 
| None) -> ArrowSeries: + import pyarrow.compute as pc # ignore-banned-import() + + if time_zone is not None: + result = pc.assume_timezone( + pc.local_timestamp(self._arrow_series._native_series), time_zone + ) + else: + result = pc.local_timestamp(self._arrow_series._native_series) + return self._arrow_series._from_native_series(result) + + def convert_time_zone(self: Self, time_zone: str) -> ArrowSeries: + import pyarrow as pa # ignore-banned-import + + if self._arrow_series.dtype.time_zone is None: # type: ignore[attr-defined] + result = self.replace_time_zone("UTC")._native_series.cast( + pa.timestamp(self._arrow_series._native_series.type.unit, time_zone) + ) + else: + result = self._arrow_series._native_series.cast( + pa.timestamp(self._arrow_series._native_series.type.unit, time_zone) + ) + + return self._arrow_series._from_native_series(result) + def date(self: Self) -> ArrowSeries: import pyarrow as pa # ignore-banned-import() diff --git a/narwhals/_dask/expr.py b/narwhals/_dask/expr.py index 10b95bc89..693fcad5e 100644 --- a/narwhals/_dask/expr.py +++ b/narwhals/_dask/expr.py @@ -10,6 +10,7 @@ from narwhals._dask.utils import add_row_index from narwhals._dask.utils import maybe_evaluate from narwhals._dask.utils import narwhals_to_native_dtype +from narwhals._pandas_like.utils import native_to_narwhals_dtype from narwhals.utils import generate_unique_token if TYPE_CHECKING: @@ -925,6 +926,33 @@ def to_string(self, format: str) -> DaskExpr: # noqa: A002 returns_scalar=False, ) + def replace_time_zone(self, time_zone: str | None) -> DaskExpr: + return self._expr._from_call( + lambda _input, _time_zone: _input.dt.tz_localize(None).dt.tz_localize( + _time_zone + ) + if _time_zone is not None + else _input.dt.tz_localize(None), + "tz_localize", + time_zone, + returns_scalar=False, + ) + + def convert_time_zone(self, time_zone: str) -> DaskExpr: + def func(s: dask_expr.Series, time_zone: str) -> dask_expr.Series: + dtype = native_to_narwhals_dtype(s, self._expr._dtypes) + if dtype.time_zone is None: # type: ignore[attr-defined] + return s.dt.tz_localize("UTC").dt.tz_convert(time_zone) + else: + return s.dt.tz_convert(time_zone) + + return self._expr._from_call( + func, + "tz_convert", + time_zone, + returns_scalar=False, + ) + def total_minutes(self) -> DaskExpr: return self._expr._from_call( lambda _input: _input.dt.total_seconds() // 60, diff --git a/narwhals/_pandas_like/expr.py b/narwhals/_pandas_like/expr.py index 2ebadbe16..07ba3e56d 100644 --- a/narwhals/_pandas_like/expr.py +++ b/narwhals/_pandas_like/expr.py @@ -572,6 +572,16 @@ def to_string(self, format: str) -> PandasLikeExpr: # noqa: A002 self._expr, "dt", "to_string", format ) + def replace_time_zone(self, time_zone: str | None) -> PandasLikeExpr: + return reuse_series_namespace_implementation( + self._expr, "dt", "replace_time_zone", time_zone + ) + + def convert_time_zone(self, time_zone: str) -> PandasLikeExpr: + return reuse_series_namespace_implementation( + self._expr, "dt", "convert_time_zone", time_zone + ) + class PandasLikeExprNameNamespace: def __init__(self: Self, expr: PandasLikeExpr) -> None: diff --git a/narwhals/_pandas_like/series.py b/narwhals/_pandas_like/series.py index 9cca66405..2532aea8f 100644 --- a/narwhals/_pandas_like/series.py +++ b/narwhals/_pandas_like/series.py @@ -928,3 +928,21 @@ def to_string(self, format: str) -> PandasLikeSeries: # noqa: A002 return self._pandas_series._from_native_series( self._pandas_series._native_series.dt.strftime(format) ) + + def replace_time_zone(self, 
time_zone: str | None) -> PandasLikeSeries: + if time_zone is not None: + result = self._pandas_series._native_series.dt.tz_localize( + None + ).dt.tz_localize(time_zone) + else: + result = self._pandas_series._native_series.dt.tz_localize(None) + return self._pandas_series._from_native_series(result) + + def convert_time_zone(self, time_zone: str) -> PandasLikeSeries: + if self._pandas_series.dtype.time_zone is None: # type: ignore[attr-defined] + result = self._pandas_series._native_series.dt.tz_localize( + "UTC" + ).dt.tz_convert(time_zone) + else: + result = self._pandas_series._native_series.dt.tz_convert(time_zone) + return self._pandas_series._from_native_series(result) diff --git a/narwhals/_pandas_like/utils.py b/narwhals/_pandas_like/utils.py index d7ecc98f2..381a78c8d 100644 --- a/narwhals/_pandas_like/utils.py +++ b/narwhals/_pandas_like/utils.py @@ -218,8 +218,8 @@ def set_axis( return obj.set_axis(index, axis=0, **kwargs) # type: ignore[attr-defined, no-any-return] -def native_to_narwhals_dtype(column: Any, dtypes: DTypes) -> DType: - dtype = str(column.dtype) +def native_to_narwhals_dtype(native_column: Any, dtypes: DTypes) -> DType: + dtype = str(native_column.dtype) pd_datetime_rgx = ( r"^datetime64\[(?Ps|ms|us|ns)(?:, (?P[a-zA-Z\/]+))?\]$" @@ -282,26 +282,30 @@ def native_to_narwhals_dtype(column: Any, dtypes: DTypes) -> DType: return dtypes.Date() if dtype.startswith(("large_list", "list")): return dtypes.List( - arrow_native_to_narwhals_dtype(column.dtype.pyarrow_dtype.value_type, dtypes) + arrow_native_to_narwhals_dtype( + native_column.dtype.pyarrow_dtype.value_type, dtypes + ) ) if dtype.startswith("fixed_size_list"): return dtypes.Array( - arrow_native_to_narwhals_dtype(column.dtype.pyarrow_dtype.value_type, dtypes), - column.dtype.pyarrow_dtype.list_size, + arrow_native_to_narwhals_dtype( + native_column.dtype.pyarrow_dtype.value_type, dtypes + ), + native_column.dtype.pyarrow_dtype.list_size, ) if dtype.startswith("struct"): return dtypes.Struct() if dtype == "object": if ( # pragma: no cover TODO(unassigned): why does this show as uncovered? - idx := getattr(column, "first_valid_index", lambda: None)() - ) is not None and isinstance(column.loc[idx], str): + idx := getattr(native_column, "first_valid_index", lambda: None)() + ) is not None and isinstance(native_column.loc[idx], str): # Infer based on first non-missing value. # For pandas pre 3.0, this isn't perfect. # After pandas 3.0, pandas has a dedicated string dtype # which is inferred by default. 
return dtypes.String() else: - df = column.to_frame() + df = native_column.to_frame() if hasattr(df, "__dataframe__"): from narwhals._interchange.dataframe import ( map_interchange_dtype_to_narwhals_dtype, diff --git a/narwhals/dataframe.py b/narwhals/dataframe.py index e4ad31b38..b0ac1c329 100644 --- a/narwhals/dataframe.py +++ b/narwhals/dataframe.py @@ -7,6 +7,7 @@ from typing import Iterable from typing import Iterator from typing import Literal +from typing import NoReturn from typing import Sequence from typing import TypeVar from typing import overload @@ -2787,7 +2788,7 @@ def __repr__(self) -> str: # pragma: no cover + "┘" ) - def __getitem__(self, item: str | slice) -> Series | Self: + def __getitem__(self, item: str | slice) -> NoReturn: msg = "Slicing is not supported on LazyFrame" raise TypeError(msg) diff --git a/narwhals/expr.py b/narwhals/expr.py index 8446d81c3..abf9c1043 100644 --- a/narwhals/expr.py +++ b/narwhals/expr.py @@ -3501,6 +3501,119 @@ def to_string(self, format: str) -> Expr: # noqa: A002 lambda plx: self._expr._call(plx).dt.to_string(format) ) + def replace_time_zone(self, time_zone: str | None) -> Expr: + """ + Replace time zone. + + Arguments: + time_zone: Target time zone. + + Examples: + >>> from datetime import datetime, timezone + >>> import narwhals as nw + >>> import pandas as pd + >>> import polars as pl + >>> import pyarrow as pa + >>> data = { + ... "a": [ + ... datetime(2024, 1, 1, tzinfo=timezone.utc), + ... datetime(2024, 1, 2, tzinfo=timezone.utc), + ... ] + ... } + >>> df_pd = pd.DataFrame(data) + >>> df_pl = pl.DataFrame(data) + >>> df_pa = pa.table(data) + + Let's define a dataframe-agnostic function: + + >>> @nw.narwhalify + ... def func(df): + ... return df.select(nw.col("a").dt.replace_time_zone("Asia/Kathmandu")) + + We can then pass pandas / PyArrow / Polars / any other supported library: + + >>> func(df_pd) + a + 0 2024-01-01 00:00:00+05:45 + 1 2024-01-02 00:00:00+05:45 + >>> func(df_pl) + shape: (2, 1) + ┌──────────────────────────────┐ + │ a │ + │ --- │ + │ datetime[μs, Asia/Kathmandu] │ + ╞══════════════════════════════╡ + │ 2024-01-01 00:00:00 +0545 │ + │ 2024-01-02 00:00:00 +0545 │ + └──────────────────────────────┘ + >>> func(df_pa) # doctest:+SKIP + pyarrow.Table + a: timestamp[us, tz=Asia/Kathmandu] + ---- + a: [[2023-12-31 18:15:00.000000Z,2024-01-01 18:15:00.000000Z]] + """ + return self._expr.__class__( + lambda plx: self._expr._call(plx).dt.replace_time_zone(time_zone) + ) + + def convert_time_zone(self, time_zone: str) -> Expr: + """ + Convert to a new time zone. + + Arguments: + time_zone: Target time zone. + + Examples: + >>> from datetime import datetime, timezone + >>> import narwhals as nw + >>> import pandas as pd + >>> import polars as pl + >>> import pyarrow as pa + >>> data = { + ... "a": [ + ... datetime(2024, 1, 1, tzinfo=timezone.utc), + ... datetime(2024, 1, 2, tzinfo=timezone.utc), + ... ] + ... } + >>> df_pd = pd.DataFrame(data) + >>> df_pl = pl.DataFrame(data) + >>> df_pa = pa.table(data) + + Let's define a dataframe-agnostic function: + + >>> @nw.narwhalify + ... def func(df): + ... 
return df.select(nw.col("a").dt.convert_time_zone("Asia/Kathmandu")) + + We can then pass pandas / PyArrow / Polars / any other supported library: + + >>> func(df_pd) + a + 0 2024-01-01 05:45:00+05:45 + 1 2024-01-02 05:45:00+05:45 + >>> func(df_pl) + shape: (2, 1) + ┌──────────────────────────────┐ + │ a │ + │ --- │ + │ datetime[μs, Asia/Kathmandu] │ + ╞══════════════════════════════╡ + │ 2024-01-01 05:45:00 +0545 │ + │ 2024-01-02 05:45:00 +0545 │ + └──────────────────────────────┘ + >>> func(df_pa) # doctest:+SKIP + pyarrow.Table + a: timestamp[us, tz=Asia/Kathmandu] + ---- + a: [[2024-01-01 00:00:00.000000Z,2024-01-02 00:00:00.000000Z]] + """ + if time_zone is None: + msg = "Target `time_zone` cannot be `None` in `convert_time_zone`. Please use `replace_time_zone(None)` if you want to remove the time zone." + raise TypeError(msg) + return self._expr.__class__( + lambda plx: self._expr._call(plx).dt.convert_time_zone(time_zone) + ) + class ExprNameNamespace: def __init__(self: Self, expr: Expr) -> None: diff --git a/narwhals/series.py b/narwhals/series.py index 1753598c1..a9eb51a42 100644 --- a/narwhals/series.py +++ b/narwhals/series.py @@ -3880,3 +3880,112 @@ def to_string(self, format: str) -> Series: # noqa: A002 return self._narwhals_series._from_compliant_series( self._narwhals_series._compliant_series.dt.to_string(format) ) + + def replace_time_zone(self, time_zone: str | None) -> Series: + """ + Replace time zone. + + Arguments: + time_zone: Target time zone. + + Examples: + >>> from datetime import datetime, timezone + >>> import narwhals as nw + >>> import pandas as pd + >>> import polars as pl + >>> import pyarrow as pa + >>> data = [ + ... datetime(2024, 1, 1, tzinfo=timezone.utc), + ... datetime(2024, 1, 2, tzinfo=timezone.utc), + ... ] + >>> s_pd = pd.Series(data) + >>> s_pl = pl.Series(data) + >>> s_pa = pa.chunked_array([data]) + + Let's define a dataframe-agnostic function: + + >>> @nw.narwhalify + ... def func(s): + ... return s.dt.replace_time_zone("Asia/Kathmandu") + + We can then pass pandas / PyArrow / Polars / any other supported library: + + >>> func(s_pd) + 0 2024-01-01 00:00:00+05:45 + 1 2024-01-02 00:00:00+05:45 + dtype: datetime64[ns, Asia/Kathmandu] + >>> func(s_pl) # doctest: +NORMALIZE_WHITESPACE + shape: (2,) + Series: '' [datetime[μs, Asia/Kathmandu]] + [ + 2024-01-01 00:00:00 +0545 + 2024-01-02 00:00:00 +0545 + ] + >>> func(s_pa) # doctest: +SKIP + + [ + [ + 2023-12-31 18:15:00.000000Z, + 2024-01-01 18:15:00.000000Z + ] + ] + """ + return self._narwhals_series._from_compliant_series( + self._narwhals_series._compliant_series.dt.replace_time_zone(time_zone) + ) + + def convert_time_zone(self, time_zone: str) -> Series: + """ + Convert time zone. + + Arguments: + time_zone: Target time zone. + + Examples: + >>> from datetime import datetime, timezone + >>> import narwhals as nw + >>> import pandas as pd + >>> import polars as pl + >>> import pyarrow as pa + >>> data = [ + ... datetime(2024, 1, 1, tzinfo=timezone.utc), + ... datetime(2024, 1, 2, tzinfo=timezone.utc), + ... ] + >>> s_pd = pd.Series(data) + >>> s_pl = pl.Series(data) + >>> s_pa = pa.chunked_array([data]) + + Let's define a dataframe-agnostic function: + + >>> @nw.narwhalify + ... def func(s): + ... 
return s.dt.convert_time_zone("Asia/Kathmandu") + + We can then pass pandas / PyArrow / Polars / any other supported library: + + >>> func(s_pd) + 0 2024-01-01 05:45:00+05:45 + 1 2024-01-02 05:45:00+05:45 + dtype: datetime64[ns, Asia/Kathmandu] + >>> func(s_pl) # doctest: +NORMALIZE_WHITESPACE + shape: (2,) + Series: '' [datetime[μs, Asia/Kathmandu]] + [ + 2024-01-01 05:45:00 +0545 + 2024-01-02 05:45:00 +0545 + ] + >>> func(s_pa) # doctest: +SKIP + + [ + [ + 2024-01-01 00:00:00.000000Z, + 2024-01-02 00:00:00.000000Z + ] + ] + """ + if time_zone is None: + msg = "Target `time_zone` cannot be `None` in `convert_time_zone`. Please use `replace_time_zone(None)` if you want to remove the time zone." + raise TypeError(msg) + return self._narwhals_series._from_compliant_series( + self._narwhals_series._compliant_series.dt.convert_time_zone(time_zone) + ) diff --git a/tests/expr_and_series/convert_time_zone_test.py b/tests/expr_and_series/convert_time_zone_test.py new file mode 100644 index 000000000..bc5d176ba --- /dev/null +++ b/tests/expr_and_series/convert_time_zone_test.py @@ -0,0 +1,120 @@ +from datetime import datetime +from datetime import timezone +from typing import Any + +import pandas as pd +import polars as pl +import pyarrow as pa +import pytest + +import narwhals.stable.v1 as nw +from narwhals.utils import parse_version +from tests.utils import Constructor +from tests.utils import compare_dicts +from tests.utils import is_windows + + +def test_convert_time_zone( + constructor: Constructor, request: pytest.FixtureRequest +) -> None: + if (any(x in str(constructor) for x in ("pyarrow", "modin")) and is_windows()) or ( + "pandas_pyarrow" in str(constructor) and parse_version(pd.__version__) < (2, 2) + ): + request.applymarker(pytest.mark.xfail) + data = { + "a": [ + datetime(2020, 1, 1, tzinfo=timezone.utc), + datetime(2020, 1, 2, tzinfo=timezone.utc), + ] + } + df = nw.from_native(constructor(data)) + result = df.select(nw.col("a").dt.convert_time_zone("Asia/Kathmandu")) + result_dtype = result.collect_schema()["a"] + assert result_dtype == nw.Datetime + assert result_dtype.time_zone == "Asia/Kathmandu" # type: ignore[attr-defined] + result_str = result.select(nw.col("a").dt.to_string("%Y-%m-%dT%H:%M%z")) + expected = {"a": ["2020-01-01T05:45+0545", "2020-01-02T05:45+0545"]} + compare_dicts(result_str, expected) + + +def test_convert_time_zone_series( + constructor_eager: Any, request: pytest.FixtureRequest +) -> None: + if ( + any(x in str(constructor_eager) for x in ("pyarrow", "modin")) and is_windows() + ) or ( + "pandas_pyarrow" in str(constructor_eager) + and parse_version(pd.__version__) < (2, 2) + ): + request.applymarker(pytest.mark.xfail) + data = { + "a": [ + datetime(2020, 1, 1, tzinfo=timezone.utc), + datetime(2020, 1, 2, tzinfo=timezone.utc), + ] + } + df = nw.from_native(constructor_eager(data), eager_only=True) + result = df.select(df["a"].dt.convert_time_zone("Asia/Kathmandu")) + result_dtype = result.collect_schema()["a"] + assert result_dtype == nw.Datetime + assert result_dtype.time_zone == "Asia/Kathmandu" # type: ignore[attr-defined] + result_str = result.select(nw.col("a").dt.to_string("%Y-%m-%dT%H:%M%z")) + expected = {"a": ["2020-01-01T05:45+0545", "2020-01-02T05:45+0545"]} + compare_dicts(result_str, expected) + + +def test_convert_time_zone_from_none( + constructor: Constructor, request: pytest.FixtureRequest +) -> None: + if ( + (any(x in str(constructor) for x in ("pyarrow", "modin")) and is_windows()) + or ( + "pandas_pyarrow" in str(constructor) + and 
parse_version(pd.__version__) < (2, 2) + ) + or ("pyarrow_table" in str(constructor) and parse_version(pa.__version__) < (12,)) + ): + request.applymarker(pytest.mark.xfail) + if "polars" in str(constructor) and parse_version(pl.__version__) < (0, 20, 7): + # polars used to disallow this + request.applymarker(pytest.mark.xfail) + data = { + "a": [ + datetime(2020, 1, 1, tzinfo=timezone.utc), + datetime(2020, 1, 2, tzinfo=timezone.utc), + ] + } + df = nw.from_native(constructor(data)) + result = df.select( + nw.col("a").dt.replace_time_zone(None).dt.convert_time_zone("Asia/Kathmandu") + ) + result_dtype = result.collect_schema()["a"] + assert result_dtype == nw.Datetime + assert result_dtype.time_zone == "Asia/Kathmandu" # type: ignore[attr-defined] + result_str = result.select(nw.col("a").dt.to_string("%Y-%m-%dT%H:%M%z")) + expected = {"a": ["2020-01-01T05:45+0545", "2020-01-02T05:45+0545"]} + compare_dicts(result_str, expected) + + +def test_convert_time_zone_to_none(constructor: Constructor) -> None: + data = { + "a": [ + datetime(2020, 1, 1, tzinfo=timezone.utc), + datetime(2020, 1, 2, tzinfo=timezone.utc), + ] + } + df = nw.from_native(constructor(data)) + with pytest.raises(TypeError, match="Target `time_zone` cannot be `None`"): + df.select(nw.col("a").dt.convert_time_zone(None)) # type: ignore[arg-type] + + +def test_convert_time_zone_to_none_series(constructor_eager: Any) -> None: + data = { + "a": [ + datetime(2020, 1, 1, tzinfo=timezone.utc), + datetime(2020, 1, 2, tzinfo=timezone.utc), + ] + } + df = nw.from_native(constructor_eager(data)) + with pytest.raises(TypeError, match="Target `time_zone` cannot be `None`"): + df["a"].dt.convert_time_zone(None) # type: ignore[arg-type] diff --git a/tests/expr_and_series/replace_time_zone_test.py b/tests/expr_and_series/replace_time_zone_test.py new file mode 100644 index 000000000..560fcfe84 --- /dev/null +++ b/tests/expr_and_series/replace_time_zone_test.py @@ -0,0 +1,125 @@ +from datetime import datetime +from datetime import timezone +from typing import Any + +import pandas as pd +import pyarrow as pa +import pytest + +import narwhals.stable.v1 as nw +from narwhals.utils import parse_version +from tests.utils import Constructor +from tests.utils import compare_dicts +from tests.utils import is_windows + + +def test_replace_time_zone( + constructor: Constructor, request: pytest.FixtureRequest +) -> None: + if ( + (any(x in str(constructor) for x in ("pyarrow", "modin")) and is_windows()) + or ("pandas_pyarrow" in str(constructor) and parse_version(pd.__version__) < (2,)) + or ("pyarrow_table" in str(constructor) and parse_version(pa.__version__) < (12,)) + ): + request.applymarker(pytest.mark.xfail) + data = { + "a": [ + datetime(2020, 1, 1, tzinfo=timezone.utc), + datetime(2020, 1, 2, tzinfo=timezone.utc), + ] + } + df = nw.from_native(constructor(data)) + result = df.select(nw.col("a").dt.replace_time_zone("Asia/Kathmandu")) + result_dtype = result.collect_schema()["a"] + assert result_dtype == nw.Datetime + assert result_dtype.time_zone == "Asia/Kathmandu" # type: ignore[attr-defined] + result_str = result.select(nw.col("a").dt.to_string("%Y-%m-%dT%H:%M%z")) + expected = {"a": ["2020-01-01T00:00+0545", "2020-01-02T00:00+0545"]} + compare_dicts(result_str, expected) + + +def test_replace_time_zone_none( + constructor: Constructor, request: pytest.FixtureRequest +) -> None: + if ( + (any(x in str(constructor) for x in ("pyarrow", "modin")) and is_windows()) + or ("pandas_pyarrow" in str(constructor) and parse_version(pd.__version__) < 
(2,)) + or ("pyarrow_table" in str(constructor) and parse_version(pa.__version__) < (12,)) + ): + request.applymarker(pytest.mark.xfail) + data = { + "a": [ + datetime(2020, 1, 1, tzinfo=timezone.utc), + datetime(2020, 1, 2, tzinfo=timezone.utc), + ] + } + df = nw.from_native(constructor(data)) + result = df.select(nw.col("a").dt.replace_time_zone(None)) + result_dtype = result.collect_schema()["a"] + assert result_dtype == nw.Datetime + assert result_dtype.time_zone is None # type: ignore[attr-defined] + result_str = result.select(nw.col("a").dt.to_string("%Y-%m-%dT%H:%M")) + expected = {"a": ["2020-01-01T00:00", "2020-01-02T00:00"]} + compare_dicts(result_str, expected) + + +def test_replace_time_zone_series( + constructor_eager: Any, request: pytest.FixtureRequest +) -> None: + if ( + (any(x in str(constructor_eager) for x in ("pyarrow", "modin")) and is_windows()) + or ( + "pandas_pyarrow" in str(constructor_eager) + and parse_version(pd.__version__) < (2,) + ) + or ( + "pyarrow_table" in str(constructor_eager) + and parse_version(pa.__version__) < (12,) + ) + ): + request.applymarker(pytest.mark.xfail) + data = { + "a": [ + datetime(2020, 1, 1, tzinfo=timezone.utc), + datetime(2020, 1, 2, tzinfo=timezone.utc), + ] + } + df = nw.from_native(constructor_eager(data), eager_only=True) + result = df.select(df["a"].dt.replace_time_zone("Asia/Kathmandu")) + result_dtype = result.collect_schema()["a"] + assert result_dtype == nw.Datetime + assert result_dtype.time_zone == "Asia/Kathmandu" # type: ignore[attr-defined] + result_str = result.select(nw.col("a").dt.to_string("%Y-%m-%dT%H:%M%z")) + expected = {"a": ["2020-01-01T00:00+0545", "2020-01-02T00:00+0545"]} + compare_dicts(result_str, expected) + + +def test_replace_time_zone_none_series( + constructor_eager: Any, request: pytest.FixtureRequest +) -> None: + if ( + (any(x in str(constructor_eager) for x in ("pyarrow", "modin")) and is_windows()) + or ( + "pandas_pyarrow" in str(constructor_eager) + and parse_version(pd.__version__) < (2,) + ) + or ( + "pyarrow_table" in str(constructor_eager) + and parse_version(pa.__version__) < (12,) + ) + ): + request.applymarker(pytest.mark.xfail) + data = { + "a": [ + datetime(2020, 1, 1, tzinfo=timezone.utc), + datetime(2020, 1, 2, tzinfo=timezone.utc), + ] + } + df = nw.from_native(constructor_eager(data), eager_only=True) + result = df.select(df["a"].dt.replace_time_zone(None)) + result_dtype = result.collect_schema()["a"] + assert result_dtype == nw.Datetime + assert result_dtype.time_zone is None # type: ignore[attr-defined] + result_str = result.select(df["a"].dt.to_string("%Y-%m-%dT%H:%M")) + expected = {"a": ["2020-01-01T00:00", "2020-01-02T00:00"]} + compare_dicts(result_str, expected) From 10c99991e3b816533e7e0d50f3dab076961f9e70 Mon Sep 17 00:00:00 2001 From: Marco Edward Gorelli Date: Wed, 9 Oct 2024 09:37:40 +0200 Subject: [PATCH 105/145] release: Bump version to 1.9.2 (#1156) * release: Bump version to 1.9.2 * correct xfail version --- docs/installation.md | 2 +- narwhals/__init__.py | 2 +- pyproject.toml | 2 +- tests/expr_and_series/convert_time_zone_test.py | 6 +++--- 4 files changed, 6 insertions(+), 6 deletions(-) diff --git a/docs/installation.md b/docs/installation.md index 0fc410132..76ff0504f 100644 --- a/docs/installation.md +++ b/docs/installation.md @@ -13,7 +13,7 @@ Then, if you start the Python REPL and see the following: ```python >>> import narwhals >>> narwhals.__version__ -'1.9.1' +'1.9.2' ``` then installation worked correctly! 
diff --git a/narwhals/__init__.py b/narwhals/__init__.py index 3e367460d..03d857d0c 100644 --- a/narwhals/__init__.py +++ b/narwhals/__init__.py @@ -59,7 +59,7 @@ from narwhals.utils import maybe_reset_index from narwhals.utils import maybe_set_index -__version__ = "1.9.1" +__version__ = "1.9.2" __all__ = [ "dependencies", diff --git a/pyproject.toml b/pyproject.toml index 48cae220d..92866a6a8 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -4,7 +4,7 @@ build-backend = "hatchling.build" [project] name = "narwhals" -version = "1.9.1" +version = "1.9.2" authors = [ { name="Marco Gorelli", email="33491632+MarcoGorelli@users.noreply.github.com" }, ] diff --git a/tests/expr_and_series/convert_time_zone_test.py b/tests/expr_and_series/convert_time_zone_test.py index bc5d176ba..ee4ccaec4 100644 --- a/tests/expr_and_series/convert_time_zone_test.py +++ b/tests/expr_and_series/convert_time_zone_test.py @@ -18,7 +18,7 @@ def test_convert_time_zone( constructor: Constructor, request: pytest.FixtureRequest ) -> None: if (any(x in str(constructor) for x in ("pyarrow", "modin")) and is_windows()) or ( - "pandas_pyarrow" in str(constructor) and parse_version(pd.__version__) < (2, 2) + "pandas_pyarrow" in str(constructor) and parse_version(pd.__version__) < (2, 1) ): request.applymarker(pytest.mark.xfail) data = { @@ -44,7 +44,7 @@ def test_convert_time_zone_series( any(x in str(constructor_eager) for x in ("pyarrow", "modin")) and is_windows() ) or ( "pandas_pyarrow" in str(constructor_eager) - and parse_version(pd.__version__) < (2, 2) + and parse_version(pd.__version__) < (2, 1) ): request.applymarker(pytest.mark.xfail) data = { @@ -70,7 +70,7 @@ def test_convert_time_zone_from_none( (any(x in str(constructor) for x in ("pyarrow", "modin")) and is_windows()) or ( "pandas_pyarrow" in str(constructor) - and parse_version(pd.__version__) < (2, 2) + and parse_version(pd.__version__) < (2, 1) ) or ("pyarrow_table" in str(constructor) and parse_version(pa.__version__) < (12,)) ): From 2f75d6466635571d0d229538198630097c37fc37 Mon Sep 17 00:00:00 2001 From: Alessandro Miola <37796412+AlessandroMiola@users.noreply.github.com> Date: Wed, 9 Oct 2024 21:38:02 +0200 Subject: [PATCH 106/145] docs: separate out `DataFrame` and `LazyFrame` in api-completeness (#1157) * docs: separate out df and lf in api-completeness * chore: fix a typo * docs: update mkdocs.yml --- mkdocs.yml | 12 ++--- utils/generate_backend_completeness.py | 74 +++++++++++++++++--------- 2 files changed, 54 insertions(+), 32 deletions(-) diff --git a/mkdocs.yml b/mkdocs.yml index 10c3741ed..75e2b0fd5 100644 --- a/mkdocs.yml +++ b/mkdocs.yml @@ -19,12 +19,12 @@ nav: - extending.md - how_it_works.md - Roadmap and related projects: roadmap_and_related.md - # Commented-out until https://github.com/narwhals-dev/narwhals/issues/1004 is addressed - # - API Completeness: - # - api-completeness/index.md - # - Supported DataFrame methods: api-completeness/dataframe.md - # - Supported Expr methods: api-completeness/expr.md - # - Supported Series methods: api-completeness/series.md + - API Completeness: + - api-completeness/index.md + - Supported DataFrame methods: api-completeness/dataframe.md + - Supported LazyFrame methods: api-completeness/lazyframe.md + - Supported Expr methods: api-completeness/expr.md + - Supported Series methods: api-completeness/series.md - API Reference: - api-reference/narwhals.md - api-reference/dataframe.md diff --git a/utils/generate_backend_completeness.py b/utils/generate_backend_completeness.py index 7f2e23ca2..d7d05daa2 
100644 --- a/utils/generate_backend_completeness.py +++ b/utils/generate_backend_completeness.py @@ -64,9 +64,43 @@ def parse_module(module_name: str, backend: str, nw_class_name: str) -> list[str return methods_ +def render_table_and_write_to_output( + results: list[pl.DataFrame], title: str, output_filename: str +) -> None: + results = ( + pl.concat(results) # noqa: PD010 + .with_columns(supported=pl.lit(":white_check_mark:")) + .pivot(on="Backend", values="supported", index=["Class", "Method"]) + .filter(pl.col("narwhals").is_not_null()) + .drop("narwhals") + .fill_null(":x:") + .sort("Class", "Method") + ) + + with pl.Config( + tbl_formatting="ASCII_MARKDOWN", + tbl_hide_column_data_types=True, + tbl_hide_dataframe_shape=True, + set_tbl_rows=results.shape[0], + set_tbl_width_chars=1_000, + ): + table = str(results) + + with TEMPLATE_PATH.open(mode="r") as stream: + new_content = Template(stream.read()).render( + {"backend_table": table, "title": title} + ) + + with (DESTINATION_PATH / f"{output_filename}.md").open(mode="w") as destination: + destination.write(new_content) + + return table + + def get_backend_completeness_table() -> None: for module_name in MODULES: results = [] + processed_classes = set() nw_namespace = f"narwhals.{module_name}" @@ -82,7 +116,7 @@ def get_backend_completeness_table() -> None: nw_methods = get_class_methods(nw_class) - narhwals = pl.DataFrame( + narwhals = pl.DataFrame( {"Class": nw_class_name, "Backend": "narwhals", "Method": nw_methods} ) @@ -102,34 +136,22 @@ def get_backend_completeness_table() -> None: for backend in BACKENDS ] - results.extend([narhwals, *backend_methods]) + results.extend([narwhals, *backend_methods]) - results = ( - pl.concat(results) # noqa: PD010 - .with_columns(supported=pl.lit(":white_check_mark:")) - .pivot(on="Backend", values="supported", index=["Class", "Method"]) - .filter(pl.col("narwhals").is_not_null()) - .drop("narwhals") - .fill_null(":x:") - .sort("Class", "Method") - ) + if nw_class_name in {"DataFrame", "LazyFrame"}: + render_table_and_write_to_output( + results=[narwhals, *backend_methods], + title=nw_class_name, + output_filename=nw_class_name.lower(), + ) + processed_classes.add(nw_class_name) - with pl.Config( - tbl_formatting="ASCII_MARKDOWN", - tbl_hide_column_data_types=True, - tbl_hide_dataframe_shape=True, - set_tbl_rows=results.shape[0], - set_tbl_width_chars=1_000, - ): - table = str(results) - - with TEMPLATE_PATH.open(mode="r") as stream: - new_content = Template(stream.read()).render( - {"backend_table": table, "title": module_name.capitalize()} - ) + if processed_classes == {"DataFrame", "LazyFrame"}: + continue - with (DESTINATION_PATH / f"{module_name}.md").open(mode="w") as destination: - destination.write(new_content) + render_table_and_write_to_output( + results=results, title=module_name.capitalize(), output_filename=module_name + ) _ = get_backend_completeness_table() From 1abea24fd837f3ac6ccb2ea6e0de60665259a50b Mon Sep 17 00:00:00 2001 From: Akshay Agrawal Date: Thu, 10 Oct 2024 22:57:42 -0700 Subject: [PATCH 107/145] Used by marimo (#1162) Narwhals is now a required dependency of the marimo notebook. --- README.md | 1 + 1 file changed, 1 insertion(+) diff --git a/README.md b/README.md index bcf314a47..44fc31e56 100644 --- a/README.md +++ b/README.md @@ -46,6 +46,7 @@ Join the party! 
- [scikit-lego](https://github.com/koaning/scikit-lego) - [scikit-playtime](https://github.com/koaning/scikit-playtime) - [timebasedcv](https://github.com/FBruzzesi/timebasedcv) +- [marimo](https://github.com/marimo-team/marimo) Feel free to add your project to the list if it's missing, and/or [chat with us on Discord](https://discord.gg/V3PqtB4VA4) if you'd like any support. From 6bbf46ce89b1fe69c467a58ca2c223eac8e1787e Mon Sep 17 00:00:00 2001 From: Cheuk Ting Ho Date: Fri, 11 Oct 2024 13:07:34 +0100 Subject: [PATCH 108/145] fix: make Series.scatter always return new series (#1159) --------- Co-authored-by: Marco Alabruzzo Co-authored-by: Marco Gorelli <33491632+MarcoGorelli@users.noreply.github.com> --- narwhals/_pandas_like/dataframe.py | 7 +++++-- narwhals/_pandas_like/series.py | 2 +- narwhals/_polars/series.py | 6 ++++++ narwhals/series.py | 30 ++++++++++++++++++++++-------- tests/series_only/scatter_test.py | 24 ++++++++++++++++++++++++ 5 files changed, 58 insertions(+), 11 deletions(-) diff --git a/narwhals/_pandas_like/dataframe.py b/narwhals/_pandas_like/dataframe.py index 6e96fb7ce..6a6292988 100644 --- a/narwhals/_pandas_like/dataframe.py +++ b/narwhals/_pandas_like/dataframe.py @@ -355,7 +355,6 @@ def with_columns( ) -> Self: index = self._native_frame.index new_columns = evaluate_into_exprs(self, *exprs, **named_exprs) - if not new_columns and len(self) == 0: return self @@ -394,7 +393,11 @@ def with_columns( backend_version=self._backend_version, ) else: - df = self._native_frame.copy(deep=False) + # This is the logic in pandas' DataFrame.assign + if self._backend_version < (2,): # pragma: no cover + df = self._native_frame.copy(deep=True) + else: + df = self._native_frame.copy(deep=False) for s in new_columns: df[s.name] = validate_dataframe_comparand(index, s) return self._from_native_frame(df) diff --git a/narwhals/_pandas_like/series.py b/narwhals/_pandas_like/series.py index 2532aea8f..2fe53b22a 100644 --- a/narwhals/_pandas_like/series.py +++ b/narwhals/_pandas_like/series.py @@ -186,7 +186,7 @@ def scatter(self, indices: int | Sequence[int], values: Any) -> Self: implementation=self._implementation, backend_version=self._backend_version, ) - s = self._native_series + s = self._native_series.copy(deep=True) s.iloc[indices] = values s.name = self.name return self._from_native_series(s) diff --git a/narwhals/_polars/series.py b/narwhals/_polars/series.py index 078042195..9ca2f7040 100644 --- a/narwhals/_polars/series.py +++ b/narwhals/_polars/series.py @@ -212,6 +212,12 @@ def sort(self, *, descending: bool = False, nulls_last: bool = False) -> Self: return self._from_native_series(result) + def scatter(self, indices: int | Sequence[int], values: Any) -> Self: + values = extract_native(values) + s = self._native_series.clone() + s.scatter(indices, values) + return self._from_native_series(s) + def value_counts( self: Self, *, diff --git a/narwhals/series.py b/narwhals/series.py index a9eb51a42..0115ac34f 100644 --- a/narwhals/series.py +++ b/narwhals/series.py @@ -139,15 +139,29 @@ def scatter(self, indices: int | Sequence[int], values: Any) -> Self: Set value(s) at given position(s). Arguments: - indices: Position(s) to set items at. - values: Values to set. + indices: Position(s) to set items at. + values: Values to set. - Warning: - For some libraries (pandas, Polars), this method operates in-place, - whereas for others (PyArrow) it doesn't! - We recommend being careful with it, and not relying on the - in-placeness. 
For example, a valid use case is when updating - a column in an eager dataframe, see the example below. + Note: + This method always returns a new Series, without modifying the original one. + Using this function in a for-loop is an anti-pattern, we recommend building + up your positions and values beforehand and doing an update in one go. + + For example, instead of + + ```python + for i in [1, 3, 2]: + value = some_function(i) + s = s.scatter(i, value) + ``` + + prefer + + ```python + positions = [1, 3, 2] + values = [some_function(x) for x in positions] + s = s.scatter(positions, values) + ``` Examples: >>> import pandas as pd diff --git a/tests/series_only/scatter_test.py b/tests/series_only/scatter_test.py index 2edab2b8c..0677a8dd8 100644 --- a/tests/series_only/scatter_test.py +++ b/tests/series_only/scatter_test.py @@ -24,3 +24,27 @@ def test_scatter(constructor_eager: Any, request: pytest.FixtureRequest) -> None "b": [142, 132, 124], } compare_dicts(result, expected) + + +def test_scatter_unchanged(constructor_eager: Any) -> None: + df = nw.from_native( + constructor_eager({"a": [1, 2, 3], "b": [142, 124, 132]}), eager_only=True + ) + df.with_columns( + df["a"].scatter([0, 1], [999, 888]), df["b"].scatter([0, 2, 1], [142, 124, 132]) + ) + expected = { + "a": [1, 2, 3], + "b": [142, 124, 132], + } + compare_dicts(df, expected) + + +def test_single_series(constructor_eager: Any) -> None: + df = nw.from_native( + constructor_eager({"a": [1, 2, 3], "b": [142, 124, 132]}), eager_only=True + ) + s = df["a"] + s.scatter([0, 1], [999, 888]) + expected = {"a": [1, 2, 3]} + compare_dicts({"a": s}, expected) From 000c360fef499bc8f4169f0759abd7cd2b6ee0ef Mon Sep 17 00:00:00 2001 From: Barret Schloerke Date: Fri, 11 Oct 2024 08:09:02 -0400 Subject: [PATCH 109/145] enh: Add `.rows(named=False)` support for pyarrow (#1155) --- narwhals/_arrow/dataframe.py | 3 +-- tests/frame/rows_test.py | 8 -------- 2 files changed, 1 insertion(+), 10 deletions(-) diff --git a/narwhals/_arrow/dataframe.py b/narwhals/_arrow/dataframe.py index 4bab24a28..a0a4e16cb 100644 --- a/narwhals/_arrow/dataframe.py +++ b/narwhals/_arrow/dataframe.py @@ -87,8 +87,7 @@ def rows( self, *, named: bool = False ) -> list[tuple[Any, ...]] | list[dict[str, Any]]: if not named: - msg = "Unnamed rows are not yet supported on PyArrow tables" - raise NotImplementedError(msg) + return list(self.iter_rows(named=False)) # type: ignore[return-value] return self._native_frame.to_pylist() # type: ignore[no-any-return] def iter_rows( diff --git a/tests/frame/rows_test.py b/tests/frame/rows_test.py index 2d94ab18e..744f66065 100644 --- a/tests/frame/rows_test.py +++ b/tests/frame/rows_test.py @@ -92,14 +92,6 @@ def test_rows( expected: list[tuple[Any, ...]] | list[dict[str, Any]], ) -> None: df = nw.from_native(df_raw, eager_only=True) - if isinstance(df_raw, pa.Table) and not named: - with pytest.raises( - NotImplementedError, - match="Unnamed rows are not yet supported on PyArrow tables", - ): - df.rows(named=named) - return - result = df.rows(named=named) assert result == expected From fabad75a0a4dead0e835bd9589c70b12da1cd586 Mon Sep 17 00:00:00 2001 From: Marco Edward Gorelli Date: Fri, 11 Oct 2024 14:28:30 +0200 Subject: [PATCH 110/145] release: Bump version to 1.9.3 (#1164) --- docs/installation.md | 2 +- narwhals/__init__.py | 2 +- pyproject.toml | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/docs/installation.md b/docs/installation.md index 76ff0504f..e3cd8f6db 100644 --- a/docs/installation.md +++ 
b/docs/installation.md @@ -13,7 +13,7 @@ Then, if you start the Python REPL and see the following: ```python >>> import narwhals >>> narwhals.__version__ -'1.9.2' +'1.9.3' ``` then installation worked correctly! diff --git a/narwhals/__init__.py b/narwhals/__init__.py index 03d857d0c..4f0b6ae82 100644 --- a/narwhals/__init__.py +++ b/narwhals/__init__.py @@ -59,7 +59,7 @@ from narwhals.utils import maybe_reset_index from narwhals.utils import maybe_set_index -__version__ = "1.9.2" +__version__ = "1.9.3" __all__ = [ "dependencies", diff --git a/pyproject.toml b/pyproject.toml index 92866a6a8..5c1e6f96b 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -4,7 +4,7 @@ build-backend = "hatchling.build" [project] name = "narwhals" -version = "1.9.2" +version = "1.9.3" authors = [ { name="Marco Gorelli", email="33491632+MarcoGorelli@users.noreply.github.com" }, ] From b05653014860d995640a6ea651b569a3b985976d Mon Sep 17 00:00:00 2001 From: Barret Schloerke Date: Sat, 12 Oct 2024 06:08:48 -0400 Subject: [PATCH 111/145] chore: Use shiny's Makefile to allow for test commands to change over time (#1161) * Use shiny's Makefile to allow for test commands to change over time Related: * posit-dev/py-shiny#1729 * posit-dev/py-shiny#1731 * Remove white space * Update step name --- .github/workflows/downstream_tests.yml | 14 +++++--------- 1 file changed, 5 insertions(+), 9 deletions(-) diff --git a/.github/workflows/downstream_tests.yml b/.github/workflows/downstream_tests.yml index 12f7f95b7..d84129d79 100644 --- a/.github/workflows/downstream_tests.yml +++ b/.github/workflows/downstream_tests.yml @@ -113,22 +113,18 @@ jobs: - name: install-basics run: uv pip install --upgrade tox virtualenv setuptools --system - name: install-shiny-dev + env: + UV_SYSTEM_PYTHON: 1 run: | cd py-shiny - uv pip install -e ".[dev,test]" --system + make narwhals-install-shiny - name: install-narwhals-dev run: | uv pip uninstall narwhals --system uv pip install -e . --system - name: show-deps run: uv pip freeze - - name: Run pytest - run: | - cd py-shiny - python tests/pytest/asyncio_prevent.py - pytest - - name: Run mypy + - name: Run `make narwhals-test-integration` run: | cd py-shiny - uv pip install mypy --system - mypy shiny + make narwhals-test-integration From f5cf6388e1eb217d3487c7be28b8480d14b0ab93 Mon Sep 17 00:00:00 2001 From: Francesco Bruzzesi <42817048+FBruzzesi@users.noreply.github.com> Date: Mon, 14 Oct 2024 09:41:39 +0200 Subject: [PATCH 112/145] feat: add `_horizontal` (#1148) * feat: add _horizontal * exception coverage --- docs/api-reference/narwhals.md | 2 + narwhals/__init__.py | 6 +- narwhals/_arrow/namespace.py | 56 +++++++++ narwhals/_dask/namespace.py | 44 ++++++- narwhals/_pandas_like/namespace.py | 54 ++++++++ narwhals/expr.py | 126 +++++++++++++++++++ narwhals/stable/v1/__init__.py | 116 ++++++++++++++++- tests/expr_and_series/all_horizontal_test.py | 10 ++ tests/expr_and_series/max_horizontal_test.py | 25 ++++ tests/expr_and_series/min_horizontal_test.py | 25 ++++ 10 files changed, 461 insertions(+), 3 deletions(-) create mode 100644 tests/expr_and_series/max_horizontal_test.py create mode 100644 tests/expr_and_series/min_horizontal_test.py diff --git a/docs/api-reference/narwhals.md b/docs/api-reference/narwhals.md index 39afb8e8b..044b20e0a 100644 --- a/docs/api-reference/narwhals.md +++ b/docs/api-reference/narwhals.md @@ -20,6 +20,7 @@ Here are the top-level functions available in Narwhals. 
- len - lit - max + - max_horizontal - maybe_align_index - maybe_convert_dtypes - maybe_get_index @@ -28,6 +29,7 @@ Here are the top-level functions available in Narwhals. - mean - mean_horizontal - min + - min_horizontal - narwhalify - new_series - nth diff --git a/narwhals/__init__.py b/narwhals/__init__.py index 4f0b6ae82..124f10c45 100644 --- a/narwhals/__init__.py +++ b/narwhals/__init__.py @@ -34,9 +34,11 @@ from narwhals.expr import len_ as len from narwhals.expr import lit from narwhals.expr import max +from narwhals.expr import max_horizontal from narwhals.expr import mean from narwhals.expr import mean_horizontal from narwhals.expr import min +from narwhals.expr import min_horizontal from narwhals.expr import nth from narwhals.expr import sum from narwhals.expr import sum_horizontal @@ -84,10 +86,12 @@ "concat_str", "len", "lit", - "min", "max", + "max_horizontal", "mean", "mean_horizontal", + "min", + "min_horizontal", "nth", "sum", "sum_horizontal", diff --git a/narwhals/_arrow/namespace.py b/narwhals/_arrow/namespace.py index c514110fc..5eae258fc 100644 --- a/narwhals/_arrow/namespace.py +++ b/narwhals/_arrow/namespace.py @@ -237,6 +237,62 @@ def func(df: ArrowDataFrame) -> list[ArrowSeries]: output_names=reduce_output_names(parsed_exprs), ) + def min_horizontal(self, *exprs: IntoArrowExpr) -> ArrowExpr: + import pyarrow.compute as pc # ignore-banned-import + + parsed_exprs = parse_into_exprs(*exprs, namespace=self) + + def func(df: ArrowDataFrame) -> list[ArrowSeries]: + init_series, *series = [s for _expr in parsed_exprs for s in _expr._call(df)] + return [ + ArrowSeries( + native_series=reduce( + lambda x, y: pc.min_element_wise(x, y), + [s._native_series for s in series], + init_series._native_series, + ), + name=init_series.name, + backend_version=self._backend_version, + dtypes=self._dtypes, + ) + ] + + return self._create_expr_from_callable( + func=func, + depth=max(x._depth for x in parsed_exprs) + 1, + function_name="min_horizontal", + root_names=combine_root_names(parsed_exprs), + output_names=reduce_output_names(parsed_exprs), + ) + + def max_horizontal(self, *exprs: IntoArrowExpr) -> ArrowExpr: + import pyarrow.compute as pc # ignore-banned-import + + parsed_exprs = parse_into_exprs(*exprs, namespace=self) + + def func(df: ArrowDataFrame) -> list[ArrowSeries]: + init_series, *series = [s for _expr in parsed_exprs for s in _expr._call(df)] + return [ + ArrowSeries( + native_series=reduce( + lambda x, y: pc.max_element_wise(x, y), + [s._native_series for s in series], + init_series._native_series, + ), + name=init_series.name, + backend_version=self._backend_version, + dtypes=self._dtypes, + ) + ] + + return self._create_expr_from_callable( + func=func, + depth=max(x._depth for x in parsed_exprs) + 1, + function_name="max_horizontal", + root_names=combine_root_names(parsed_exprs), + output_names=reduce_output_names(parsed_exprs), + ) + def concat( self, items: Iterable[ArrowDataFrame], diff --git a/narwhals/_dask/namespace.py b/narwhals/_dask/namespace.py index e0f2bbbde..23c133a12 100644 --- a/narwhals/_dask/namespace.py +++ b/narwhals/_dask/namespace.py @@ -232,7 +232,7 @@ def concat( ) raise NotImplementedError - def mean_horizontal(self, *exprs: IntoDaskExpr) -> IntoDaskExpr: + def mean_horizontal(self, *exprs: IntoDaskExpr) -> DaskExpr: parsed_exprs = parse_into_exprs(*exprs, namespace=self) def func(df: DaskLazyFrame) -> list[dask_expr.Series]: @@ -256,6 +256,48 @@ def func(df: DaskLazyFrame) -> list[dask_expr.Series]: dtypes=self._dtypes, ) + def 
min_horizontal(self, *exprs: IntoDaskExpr) -> DaskExpr: + import dask.dataframe as dd # ignore-banned-import + + parsed_exprs = parse_into_exprs(*exprs, namespace=self) + + def func(df: DaskLazyFrame) -> list[dask_expr.Series]: + series = [s for _expr in parsed_exprs for s in _expr._call(df)] + + return [dd.concat(series, axis=1).min(axis=1).rename(series[0].name)] + + return DaskExpr( + call=func, + depth=max(x._depth for x in parsed_exprs) + 1, + function_name="min_horizontal", + root_names=combine_root_names(parsed_exprs), + output_names=reduce_output_names(parsed_exprs), + returns_scalar=False, + backend_version=self._backend_version, + dtypes=self._dtypes, + ) + + def max_horizontal(self, *exprs: IntoDaskExpr) -> DaskExpr: + import dask.dataframe as dd # ignore-banned-import + + parsed_exprs = parse_into_exprs(*exprs, namespace=self) + + def func(df: DaskLazyFrame) -> list[dask_expr.Series]: + series = [s for _expr in parsed_exprs for s in _expr._call(df)] + + return [dd.concat(series, axis=1).max(axis=1).rename(series[0].name)] + + return DaskExpr( + call=func, + depth=max(x._depth for x in parsed_exprs) + 1, + function_name="max_horizontal", + root_names=combine_root_names(parsed_exprs), + output_names=reduce_output_names(parsed_exprs), + returns_scalar=False, + backend_version=self._backend_version, + dtypes=self._dtypes, + ) + def _create_expr_from_series(self, _: Any) -> NoReturn: msg = "`_create_expr_from_series` for DaskNamespace exists only for compatibility" raise NotImplementedError(msg) diff --git a/narwhals/_pandas_like/namespace.py b/narwhals/_pandas_like/namespace.py index 8f6e17518..63d3454a5 100644 --- a/narwhals/_pandas_like/namespace.py +++ b/narwhals/_pandas_like/namespace.py @@ -278,6 +278,60 @@ def func(df: PandasLikeDataFrame) -> list[PandasLikeSeries]: output_names=reduce_output_names(parsed_exprs), ) + def min_horizontal(self, *exprs: IntoPandasLikeExpr) -> PandasLikeExpr: + parsed_exprs = parse_into_exprs(*exprs, namespace=self) + + def func(df: PandasLikeDataFrame) -> list[PandasLikeSeries]: + series = [s for _expr in parsed_exprs for s in _expr._call(df)] + + return [ + PandasLikeSeries( + native_series=self.concat( + (s.to_frame() for s in series), how="horizontal" + ) + ._native_frame.min(axis=1) + .rename(series[0].name), + implementation=self._implementation, + backend_version=self._backend_version, + dtypes=self._dtypes, + ) + ] + + return self._create_expr_from_callable( + func=func, + depth=max(x._depth for x in parsed_exprs) + 1, + function_name="min_horizontal", + root_names=combine_root_names(parsed_exprs), + output_names=reduce_output_names(parsed_exprs), + ) + + def max_horizontal(self, *exprs: IntoPandasLikeExpr) -> PandasLikeExpr: + parsed_exprs = parse_into_exprs(*exprs, namespace=self) + + def func(df: PandasLikeDataFrame) -> list[PandasLikeSeries]: + series = [s for _expr in parsed_exprs for s in _expr._call(df)] + + return [ + PandasLikeSeries( + native_series=self.concat( + (s.to_frame() for s in series), how="horizontal" + ) + ._native_frame.max(axis=1) + .rename(series[0].name), + implementation=self._implementation, + backend_version=self._backend_version, + dtypes=self._dtypes, + ) + ] + + return self._create_expr_from_callable( + func=func, + depth=max(x._depth for x in parsed_exprs) + 1, + function_name="max_horizontal", + root_names=combine_root_names(parsed_exprs), + output_names=reduce_output_names(parsed_exprs), + ) + def concat( self, items: Iterable[PandasLikeDataFrame], diff --git a/narwhals/expr.py b/narwhals/expr.py index 
abf9c1043..edd52b305 100644 --- a/narwhals/expr.py +++ b/narwhals/expr.py @@ -4221,6 +4221,132 @@ def sum_horizontal(*exprs: IntoExpr | Iterable[IntoExpr]) -> Expr: ) +def min_horizontal(*exprs: IntoExpr | Iterable[IntoExpr]) -> Expr: + """ + Get the minimum value horizontally across columns. + + Notes: + We support `min_horizontal` over numeric columns only. + + Arguments: + exprs: Name(s) of the columns to use in the aggregation function. Accepts + expression input. + + Examples: + >>> import narwhals as nw + >>> import pandas as pd + >>> import polars as pl + >>> import pyarrow as pa + >>> data = { + ... "a": [1, 8, 3], + ... "b": [4, 5, None], + ... "c": ["x", "y", "z"], + ... } + + We define a dataframe-agnostic function that computes the horizontal min of "a" + and "b" columns: + + >>> @nw.narwhalify + ... def func(df): + ... return df.select(nw.min_horizontal("a", "b")) + + We can then pass either pandas, polars or pyarrow to `func`: + + >>> func(pd.DataFrame(data)) + a + 0 1.0 + 1 5.0 + 2 3.0 + >>> func(pl.DataFrame(data)) + shape: (3, 1) + ┌─────┐ + │ a │ + │ --- │ + │ i64 │ + ╞═════╡ + │ 1 │ + │ 5 │ + │ 3 │ + └─────┘ + >>> func(pa.table(data)) + pyarrow.Table + a: int64 + ---- + a: [[1,5,3]] + """ + if not exprs: + msg = "At least one expression must be passed to `min_horizontal`" + raise ValueError(msg) + return Expr( + lambda plx: plx.min_horizontal( + *[extract_compliant(plx, v) for v in flatten(exprs)] + ) + ) + + +def max_horizontal(*exprs: IntoExpr | Iterable[IntoExpr]) -> Expr: + """ + Get the maximum value horizontally across columns. + + Notes: + We support `max_horizontal` over numeric columns only. + + Arguments: + exprs: Name(s) of the columns to use in the aggregation function. Accepts + expression input. + + Examples: + >>> import narwhals as nw + >>> import pandas as pd + >>> import polars as pl + >>> import pyarrow as pa + >>> data = { + ... "a": [1, 8, 3], + ... "b": [4, 5, None], + ... "c": ["x", "y", "z"], + ... } + + We define a dataframe-agnostic function that computes the horizontal max of "a" + and "b" columns: + + >>> @nw.narwhalify + ... def func(df): + ... return df.select(nw.max_horizontal("a", "b")) + + We can then pass either pandas, polars or pyarrow to `func`: + + >>> func(pd.DataFrame(data)) + a + 0 4.0 + 1 8.0 + 2 3.0 + >>> func(pl.DataFrame(data)) + shape: (3, 1) + ┌─────┐ + │ a │ + │ --- │ + │ i64 │ + ╞═════╡ + │ 4 │ + │ 8 │ + │ 3 │ + └─────┘ + >>> func(pa.table(data)) + pyarrow.Table + a: int64 + ---- + a: [[4,8,3]] + """ + if not exprs: + msg = "At least one expression must be passed to `max_horizontal`" + raise ValueError(msg) + return Expr( + lambda plx: plx.max_horizontal( + *[extract_compliant(plx, v) for v in flatten(exprs)] + ) + ) + + class When: def __init__(self, *predicates: IntoExpr | Iterable[IntoExpr]) -> None: self._predicates = flatten([predicates]) diff --git a/narwhals/stable/v1/__init__.py b/narwhals/stable/v1/__init__.py index 1c3a3bd1f..e9aac4cf4 100644 --- a/narwhals/stable/v1/__init__.py +++ b/narwhals/stable/v1/__init__.py @@ -1527,6 +1527,118 @@ def mean_horizontal(*exprs: IntoExpr | Iterable[IntoExpr]) -> Expr: return _stableify(nw.mean_horizontal(*exprs)) +def min_horizontal(*exprs: IntoExpr | Iterable[IntoExpr]) -> Expr: + """ + Get the minimum value horizontally across columns. + + Notes: + We support `min_horizontal` over numeric columns only. + + Arguments: + exprs: Name(s) of the columns to use in the aggregation function. Accepts + expression input. 
+ + Examples: + >>> import narwhals.stable.v1 as nw + >>> import pandas as pd + >>> import polars as pl + >>> import pyarrow as pa + >>> data = { + ... "a": [1, 8, 3], + ... "b": [4, 5, None], + ... "c": ["x", "y", "z"], + ... } + + We define a dataframe-agnostic function that computes the horizontal min of "a" + and "b" columns: + + >>> @nw.narwhalify + ... def func(df): + ... return df.select(nw.min_horizontal("a", "b")) + + We can then pass either pandas, polars or pyarrow to `func`: + + >>> func(pd.DataFrame(data)) + a + 0 1.0 + 1 5.0 + 2 3.0 + >>> func(pl.DataFrame(data)) + shape: (3, 1) + ┌─────┐ + │ a │ + │ --- │ + │ i64 │ + ╞═════╡ + │ 1 │ + │ 5 │ + │ 3 │ + └─────┘ + >>> func(pa.table(data)) + pyarrow.Table + a: int64 + ---- + a: [[1,5,3]] + """ + return _stableify(nw.min_horizontal(*exprs)) + + +def max_horizontal(*exprs: IntoExpr | Iterable[IntoExpr]) -> Expr: + """ + Get the maximum value horizontally across columns. + + Notes: + We support `max_horizontal` over numeric columns only. + + Arguments: + exprs: Name(s) of the columns to use in the aggregation function. Accepts + expression input. + + Examples: + >>> import narwhals.stable.v1 as nw + >>> import pandas as pd + >>> import polars as pl + >>> import pyarrow as pa + >>> data = { + ... "a": [1, 8, 3], + ... "b": [4, 5, None], + ... "c": ["x", "y", "z"], + ... } + + We define a dataframe-agnostic function that computes the horizontal max of "a" + and "b" columns: + + >>> @nw.narwhalify + ... def func(df): + ... return df.select(nw.max_horizontal("a", "b")) + + We can then pass either pandas, polars or pyarrow to `func`: + + >>> func(pd.DataFrame(data)) + a + 0 4.0 + 1 8.0 + 2 3.0 + >>> func(pl.DataFrame(data)) + shape: (3, 1) + ┌─────┐ + │ a │ + │ --- │ + │ i64 │ + ╞═════╡ + │ 4 │ + │ 8 │ + │ 3 │ + └─────┘ + >>> func(pa.table(data)) + pyarrow.Table + a: int64 + ---- + a: [[4,8,3]] + """ + return _stableify(nw.max_horizontal(*exprs)) + + @overload def concat( items: Iterable[DataFrame[Any]], @@ -2153,10 +2265,12 @@ def from_dict( "nth", "len", "lit", - "min", "max", + "max_horizontal", "mean", "mean_horizontal", + "min", + "min_horizontal", "sum", "sum_horizontal", "when", diff --git a/tests/expr_and_series/all_horizontal_test.py b/tests/expr_and_series/all_horizontal_test.py index 98dc9f9f9..01d53fe63 100644 --- a/tests/expr_and_series/all_horizontal_test.py +++ b/tests/expr_and_series/all_horizontal_test.py @@ -87,3 +87,13 @@ def test_horizontal_expressions_emtpy(constructor: Constructor) -> None: ValueError, match=r"At least one expression must be passed.*sum_horizontal" ): df.select(nw.sum_horizontal()) + + with pytest.raises( + ValueError, match=r"At least one expression must be passed.*max_horizontal" + ): + df.select(nw.max_horizontal()) + + with pytest.raises( + ValueError, match=r"At least one expression must be passed.*min_horizontal" + ): + df.select(nw.min_horizontal()) diff --git a/tests/expr_and_series/max_horizontal_test.py b/tests/expr_and_series/max_horizontal_test.py new file mode 100644 index 000000000..711ce4e0d --- /dev/null +++ b/tests/expr_and_series/max_horizontal_test.py @@ -0,0 +1,25 @@ +from typing import Any + +import pytest + +import narwhals.stable.v1 as nw +from tests.utils import Constructor +from tests.utils import compare_dicts + +data = {"a": [1, 3, None, None], "b": [4, None, 6, None], "z": [3, 1, None, None]} +expected_values = [4, 3, 6, float("nan")] + + +@pytest.mark.parametrize("col_expr", [nw.col("a"), "a"]) +def test_maxh(constructor: Constructor, col_expr: Any) -> None: + df = 
nw.from_native(constructor(data)) + result = df.select(horizontal_max=nw.max_horizontal(col_expr, nw.col("b"), "z")) + expected = {"horizontal_max": expected_values} + compare_dicts(result, expected) + + +def test_maxh_all(constructor: Constructor) -> None: + df = nw.from_native(constructor(data)) + result = df.select(nw.max_horizontal(nw.all()), c=nw.max_horizontal(nw.all())) + expected = {"a": expected_values, "c": expected_values} + compare_dicts(result, expected) diff --git a/tests/expr_and_series/min_horizontal_test.py b/tests/expr_and_series/min_horizontal_test.py new file mode 100644 index 000000000..ca34d440d --- /dev/null +++ b/tests/expr_and_series/min_horizontal_test.py @@ -0,0 +1,25 @@ +from typing import Any + +import pytest + +import narwhals.stable.v1 as nw +from tests.utils import Constructor +from tests.utils import compare_dicts + +data = {"a": [1, 3, None, None], "b": [4, None, 6, None], "z": [3, 1, None, None]} +expcted_values = [1, 1, 6, float("nan")] + + +@pytest.mark.parametrize("col_expr", [nw.col("a"), "a"]) +def test_minh(constructor: Constructor, col_expr: Any) -> None: + df = nw.from_native(constructor(data)) + result = df.select(horizontal_min=nw.min_horizontal(col_expr, nw.col("b"), "z")) + expected = {"horizontal_min": expcted_values} + compare_dicts(result, expected) + + +def test_minh_all(constructor: Constructor) -> None: + df = nw.from_native(constructor(data)) + result = df.select(nw.min_horizontal(nw.all()), c=nw.min_horizontal(nw.all())) + expected = {"a": expcted_values, "c": expcted_values} + compare_dicts(result, expected) From bcddf07a321c82cd064acb1964bdcd76646b8dfa Mon Sep 17 00:00:00 2001 From: Francesco Bruzzesi <42817048+FBruzzesi@users.noreply.github.com> Date: Mon, 14 Oct 2024 09:47:49 +0200 Subject: [PATCH 113/145] ci: bump version workflow (#1063) * ci: bump version workflow * Apply suggestions from code review --------- Co-authored-by: Marco Edward Gorelli --- .github/workflows/bump-version.yml | 47 ++++++++++++++++++++++++++++++ 1 file changed, 47 insertions(+) create mode 100644 .github/workflows/bump-version.yml diff --git a/.github/workflows/bump-version.yml b/.github/workflows/bump-version.yml new file mode 100644 index 000000000..b6360cc3d --- /dev/null +++ b/.github/workflows/bump-version.yml @@ -0,0 +1,47 @@ +name: Bump version + +on: + workflow_dispatch: + inputs: + release_type: + description: 'Type of version bump: patch, minor, or major' + required: true + default: 'patch' + type: choice + options: + - patch + - minor + - major + +jobs: + bump-version: + runs-on: ubuntu-latest + steps: + + - uses: actions/checkout@v4 + + - uses: actions/setup-python@v5 + with: + python-version: "3.11" + + - name: bump version + run: | + git config user.name "github-actions[bot]" + git config user.email "github-actions[bot]@users.noreply.github.com" + git checkout -b bump-version + python utils/bump_version.py ${{ github.event.inputs.release_type }} + + - name: Create pull request + uses: actions/github-script@v6 + if: github.actor == 'MarcoGorelli' || github.actor == 'FBruzzesi' + with: + script: | + const branch = `bump-version` + github.pulls.create({ + owner: context.repo.owner, + repo: context.repo.repo, + head: branch, + base: 'main', + body: 'Bumping the version to reflect the latest release type.', + }) + \ No newline at end of file From 7b63503e44168be9e5290800a6a83039e25c242d Mon Sep 17 00:00:00 2001 From: Marco Edward Gorelli Date: Mon, 14 Oct 2024 09:48:10 +0200 Subject: [PATCH 114/145] feat: add The Zen of Narwhals (#1168) * feat: 
add The Zen of Narwhals * coverage --- narwhals/this.py | 17 +++++++++++++++++ pyproject.toml | 1 + 2 files changed, 18 insertions(+) create mode 100644 narwhals/this.py diff --git a/narwhals/this.py b/narwhals/this.py new file mode 100644 index 000000000..541ee7704 --- /dev/null +++ b/narwhals/this.py @@ -0,0 +1,17 @@ +# ruff: noqa +ZEN = """\ +⣿⣿⣿⣿⣿⠘⣿⣿⣿⣿⣿⣿⣿⣿⣿⣿⣿⣿⣿⣿⣿⣿⣿⣿⣿⣿⣿⣿⣿⣿ THE ZEN OF NARWHALS +⣿⣿⣿⣿⣿⠠⢹⣿⣿⣿⣿⣿⣿⣿⣿⣿⣿⣿⣿⣿⣿⣿⣿⣿⣿⣿⣿⣿⣿⣿ Keep it simple +⣿⣿⣿⣿⣿⡀⡄⢿⣿⣿⣿⣿⣿⣿⣿⣿⣿⣿⣿⣿⣿⣿⣿⣿⣿⣿⣿⣿⣿⣿ Move slowly and keep things working +⣿⣿⣿⣿⣿⡇⡼⡘⠛⠿⠿⠿⢿⣿⣿⣿⣿⣿⣿⣿⣿⣿⣿⣿⣿⣿⣿⣿⣿⣿ A good API is an honest one +⣿⣿⣿⡿⣫⡄⠾⣣⠹⣿⣿⣿⣶⣮⣙⠻⢿⣿⣿⣿⣿⣿⣿⣿⣿⣿⣿⣿⣿⣿ Yes, that needs documenting +⣿⣿⢋⣴⣿⣷⣬⣭⣾⣿⣿⣿⣿⣿⣿⣿⣦⡙⣿⣿⣿⣿⣿⣿⣿⣿⣿⣿⣿⣿ People learn better from examples +⣿⢃⣾⣿⣿⣿⣿⣿⣿⣿⣿⣿⣿⣿⣿⣿⣿⣷⡌⢿⣿⣿⣿⣿⣿⣿⣿⣿⣿⣿ than from explanations⠀ +⡏⠀⢰⠄⢻⣿⣿⣿⣿⡿⠋⢉⠻⣿⣿⣿⣿⣿⣿⡜⣿⣿⡿⢁⢻⣿⣿⣿⣿⣿ If in doubt, say 'no'⠀ +⡇⣌⣀⣠⣾⣿⣿⣿⣿⣇⠶⠉⢁⣿⣿⣿⣿⣿⣿⣧⡹⣿⡇⣿⣧⠻⠿⠿⠿⠿ you can always reconsider⠀ +⡧⢹⣿⣿⣿⣜⣟⣸⣿⣿⣷⣶⣿⡿⣿⣿⣝⢿⣿⣿⣷⣬⣥⣿⣿⣿⣿⣿⡟⣰ Yes, we need a test for that +⢡⣆⢻⣿⣿⣿⣿⣿⣿⣿⣿⣿⣿⣧⡙⣿⣿⡇⣿⣿⣿⣿⠟⣋⣭⣛⠻⣋⣴⣿ If you want users +⣶⣤⣤⣙⠻⣿⣿⣿⣿⣿⣿⣿⣿⣿⣷⣦⣍⣡⣿⡿⢋⣴⣿⣿⣿⣿⣿⣿⣿⣿ you need good docs⠀ +⣿⣿⣿⣿⣿⣶⣬⣙⣛⠻⠿⠿⠿⠿⠿⠟⣛⣩⣥⣶⣿⣿⣿⣿⣿⣿⣿⣿⣿⣿ Our code is not irreplaceable""" + +print(ZEN) diff --git a/pyproject.toml b/pyproject.toml index 5c1e6f96b..c4974d8c0 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -130,6 +130,7 @@ plugins = ["covdefaults"] omit = [ 'narwhals/typing.py', 'narwhals/stable/v1/typing.py', + 'narwhals/this.py', # we can run this in every environment that we measure coverage on due to upper-bound constraits 'narwhals/_ibis/*', ] From 25df02f798a2785283be017d5fc0a7a66e34ebe1 Mon Sep 17 00:00:00 2001 From: Marco Edward Gorelli Date: Mon, 14 Oct 2024 09:58:41 +0200 Subject: [PATCH 115/145] ci: try fixup bump-version.yml (#1171) --- .github/workflows/bump-version.yml | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/.github/workflows/bump-version.yml b/.github/workflows/bump-version.yml index b6360cc3d..3577fd0e5 100644 --- a/.github/workflows/bump-version.yml +++ b/.github/workflows/bump-version.yml @@ -29,6 +29,7 @@ jobs: git config user.name "github-actions[bot]" git config user.email "github-actions[bot]@users.noreply.github.com" git checkout -b bump-version + git remote rename origin upstream python utils/bump_version.py ${{ github.event.inputs.release_type }} - name: Create pull request @@ -44,4 +45,4 @@ jobs: base: 'main', body: 'Bumping the version to reflect the latest release type.', }) - \ No newline at end of file + From 2e7afa5df262ffdbfcb1abe9cca93020445589f8 Mon Sep 17 00:00:00 2001 From: Francesco Bruzzesi <42817048+FBruzzesi@users.noreply.github.com> Date: Mon, 14 Oct 2024 19:07:23 +0200 Subject: [PATCH 116/145] fix: add native_namespace to interchange level of support (#1172) --- narwhals/_duckdb/dataframe.py | 6 +++ narwhals/_duckdb/series.py | 6 +++ narwhals/_ibis/dataframe.py | 7 +++ narwhals/_ibis/series.py | 6 +++ narwhals/_interchange/dataframe.py | 8 +++ narwhals/_interchange/series.py | 9 ++++ .../interchange_native_namespace_test.py | 52 +++++++++++++++++++ 7 files changed, 94 insertions(+) create mode 100644 tests/frame/interchange_native_namespace_test.py diff --git a/narwhals/_duckdb/dataframe.py b/narwhals/_duckdb/dataframe.py index 555555d4a..5877ed51e 100644 --- a/narwhals/_duckdb/dataframe.py +++ b/narwhals/_duckdb/dataframe.py @@ -4,9 +4,12 @@ from typing import TYPE_CHECKING from typing import Any +from narwhals.dependencies import get_duckdb from narwhals.utils import parse_version if TYPE_CHECKING: + from types import ModuleType + import pandas as pd import pyarrow as pa from typing_extensions import Self @@ 
-68,6 +71,9 @@ def __init__(self, df: Any, dtypes: DTypes) -> None: def __narwhals_dataframe__(self) -> Any: return self + def __native_namespace__(self: Self) -> ModuleType: + return get_duckdb() # type: ignore[no-any-return] + def __getitem__(self, item: str) -> DuckDBInterchangeSeries: from narwhals._duckdb.series import DuckDBInterchangeSeries diff --git a/narwhals/_duckdb/series.py b/narwhals/_duckdb/series.py index a7dbdd549..e30b14ac9 100644 --- a/narwhals/_duckdb/series.py +++ b/narwhals/_duckdb/series.py @@ -4,8 +4,11 @@ from typing import Any from narwhals._duckdb.dataframe import map_duckdb_dtype_to_narwhals_dtype +from narwhals.dependencies import get_duckdb if TYPE_CHECKING: + from types import ModuleType + from narwhals.typing import DTypes @@ -17,6 +20,9 @@ def __init__(self, df: Any, dtypes: DTypes) -> None: def __narwhals_series__(self) -> Any: return self + def __native_namespace__(self) -> ModuleType: + return get_duckdb() # type: ignore[no-any-return] + def __getattr__(self, attr: str) -> Any: if attr == "dtype": return map_duckdb_dtype_to_narwhals_dtype( diff --git a/narwhals/_ibis/dataframe.py b/narwhals/_ibis/dataframe.py index 8ee01e78b..9d7ebefb0 100644 --- a/narwhals/_ibis/dataframe.py +++ b/narwhals/_ibis/dataframe.py @@ -3,7 +3,11 @@ from typing import TYPE_CHECKING from typing import Any +from narwhals.dependencies import get_ibis + if TYPE_CHECKING: + from types import ModuleType + import pandas as pd import pyarrow as pa from typing_extensions import Self @@ -59,6 +63,9 @@ def __init__(self, df: Any, dtypes: DTypes) -> None: def __narwhals_dataframe__(self) -> Any: return self + def __native_namespace__(self: Self) -> ModuleType: + return get_ibis() # type: ignore[no-any-return] + def __getitem__(self, item: str) -> IbisInterchangeSeries: from narwhals._ibis.series import IbisInterchangeSeries diff --git a/narwhals/_ibis/series.py b/narwhals/_ibis/series.py index 2f6cd6faa..e2c313690 100644 --- a/narwhals/_ibis/series.py +++ b/narwhals/_ibis/series.py @@ -4,8 +4,11 @@ from typing import Any from narwhals._ibis.dataframe import map_ibis_dtype_to_narwhals_dtype +from narwhals.dependencies import get_ibis if TYPE_CHECKING: + from types import ModuleType + from narwhals.typing import DTypes @@ -17,6 +20,9 @@ def __init__(self, df: Any, dtypes: DTypes) -> None: def __narwhals_series__(self) -> Any: return self + def __native_namespace__(self) -> ModuleType: + return get_ibis() # type: ignore[no-any-return] + def __getattr__(self, attr: str) -> Any: if attr == "dtype": return map_ibis_dtype_to_narwhals_dtype( diff --git a/narwhals/_interchange/dataframe.py b/narwhals/_interchange/dataframe.py index 1dc671dc7..4e8e542e7 100644 --- a/narwhals/_interchange/dataframe.py +++ b/narwhals/_interchange/dataframe.py @@ -82,6 +82,14 @@ def __init__(self, df: Any, dtypes: DTypes) -> None: def __narwhals_dataframe__(self) -> Any: return self + def __native_namespace__(self: Self) -> NoReturn: + msg = ( + "Cannot access native namespace for metadata-only dataframes with unknown backend." + "If you would like to see this kind of object supported in Narwhals, please " + "open a feature request at https://github.com/narwhals-dev/narwhals/issues." 
+ ) + raise NotImplementedError(msg) + def __getitem__(self, item: str) -> InterchangeSeries: from narwhals._interchange.series import InterchangeSeries diff --git a/narwhals/_interchange/series.py b/narwhals/_interchange/series.py index 00426e6c0..a9cf41f07 100644 --- a/narwhals/_interchange/series.py +++ b/narwhals/_interchange/series.py @@ -2,6 +2,7 @@ from typing import TYPE_CHECKING from typing import Any +from typing import NoReturn from narwhals._interchange.dataframe import map_interchange_dtype_to_narwhals_dtype @@ -17,6 +18,14 @@ def __init__(self, df: Any, dtypes: DTypes) -> None: def __narwhals_series__(self) -> Any: return self + def __native_namespace__(self) -> NoReturn: + msg = ( + "Cannot access native namespace for metadata-only series with unknown backend. " + "If you would like to see this kind of object supported in Narwhals, please " + "open a feature request at https://github.com/narwhals-dev/narwhals/issues." + ) + raise NotImplementedError(msg) + def __getattr__(self, attr: str) -> Any: if attr == "dtype": return map_interchange_dtype_to_narwhals_dtype( diff --git a/tests/frame/interchange_native_namespace_test.py b/tests/frame/interchange_native_namespace_test.py new file mode 100644 index 000000000..8a67d07b8 --- /dev/null +++ b/tests/frame/interchange_native_namespace_test.py @@ -0,0 +1,52 @@ +import duckdb +import polars as pl +import pytest + +import narwhals.stable.v1 as nw + +data = {"a": [1, 2, 3], "b": [4.5, 6.7, 8.9], "z": ["x", "y", "w"]} + + +def test_interchange() -> None: + df_pl = pl.DataFrame(data) + df = nw.from_native(df_pl.__dataframe__(), eager_or_interchange_only=True) + series = df["a"] + + with pytest.raises( + NotImplementedError, + match="Cannot access native namespace for metadata-only dataframes with unknown backend", + ): + df.__native_namespace__() + + with pytest.raises( + NotImplementedError, + match="Cannot access native namespace for metadata-only series with unknown backend", + ): + series.__native_namespace__() + + +def test_ibis( + tmpdir: pytest.TempdirFactory, +) -> None: # pragma: no cover + ibis = pytest.importorskip("ibis") + df_pl = pl.DataFrame(data) + + filepath = str(tmpdir / "file.parquet") # type: ignore[operator] + df_pl.write_parquet(filepath) + tbl = ibis.read_parquet(filepath) + df = nw.from_native(tbl, eager_or_interchange_only=True) + series = df["a"] + + assert df.__native_namespace__() == ibis + assert series.__native_namespace__() == ibis + + +def test_duckdb() -> None: + df_pl = pl.DataFrame(data) # noqa: F841 + + rel = duckdb.sql("select * from df_pl") + df = nw.from_native(rel, eager_or_interchange_only=True) + series = df["a"] + + assert df.__native_namespace__() == duckdb + assert series.__native_namespace__() == duckdb From 0e12138b9591d804b04bbdcc749e0982d65e00e8 Mon Sep 17 00:00:00 2001 From: Marco Edward Gorelli Date: Mon, 14 Oct 2024 19:09:29 +0200 Subject: [PATCH 117/145] test: unxfail test_unary for dask (#1174) --- tests/expr_and_series/unary_test.py | 9 +++------ 1 file changed, 3 insertions(+), 6 deletions(-) diff --git a/tests/expr_and_series/unary_test.py b/tests/expr_and_series/unary_test.py index dabab0c03..66afd22af 100644 --- a/tests/expr_and_series/unary_test.py +++ b/tests/expr_and_series/unary_test.py @@ -1,15 +1,11 @@ from typing import Any -import pytest - import narwhals.stable.v1 as nw from tests.utils import Constructor from tests.utils import compare_dicts -def test_unary(constructor: Constructor, request: pytest.FixtureRequest) -> None: - if "dask" in str(constructor): - 
request.applymarker(pytest.mark.xfail) +def test_unary(constructor: Constructor) -> None: data = {"a": [1, 3, 2], "b": [4, 4, 6], "z": [7.0, 8, 9]} result = ( nw.from_native(constructor(data)) @@ -20,7 +16,8 @@ def test_unary(constructor: Constructor, request: pytest.FixtureRequest) -> None z_min=nw.col("z").min(), z_max=nw.col("z").max(), ) - .select(nw.col("a_mean", "a_sum", "b_nunique", "z_min", "z_max").unique()) + .unique(["a_mean", "a_sum", "b_nunique", "z_min", "z_max"]) + .select(["a_mean", "a_sum", "b_nunique", "z_min", "z_max"]) ) expected = { "a_mean": [2], From 144111936025a72be98463d606e80e032b69b1c1 Mon Sep 17 00:00:00 2001 From: ChinoUkaegbu <77782533+ChinoUkaegbu@users.noreply.github.com> Date: Tue, 15 Oct 2024 14:08:16 +0100 Subject: [PATCH 118/145] Delete tpch/notebooks directory (#1180) --- tpch/notebooks/q1/execute.ipynb | 441 ----------------- tpch/notebooks/q1/kernel-metadata.json | 15 - tpch/notebooks/q10/execute.ipynb | 385 --------------- tpch/notebooks/q10/kernel-metadata.json | 15 - tpch/notebooks/q11/execute.ipynb | 379 --------------- tpch/notebooks/q11/kernel-metadata.json | 15 - tpch/notebooks/q15/execute.ipynb | 366 -------------- tpch/notebooks/q15/kernel-metadata.json | 15 - tpch/notebooks/q17/execute.ipynb | 357 -------------- tpch/notebooks/q17/kernel-metadata.json | 15 - tpch/notebooks/q18/execute.ipynb | 210 --------- tpch/notebooks/q18/kernel-metadata.json | 15 - tpch/notebooks/q19/execute.ipynb | 373 --------------- tpch/notebooks/q19/kernel-metadata.json | 15 - tpch/notebooks/q2/execute.ipynb | 474 ------------------- tpch/notebooks/q2/kernel-metadata.json | 15 - tpch/notebooks/q20/execute.ipynb | 382 --------------- tpch/notebooks/q20/kernel-metadata.json | 15 - tpch/notebooks/q21/execute.ipynb | 450 ------------------ tpch/notebooks/q21/kernel-metadata.json | 15 - tpch/notebooks/q3/execute.ipynb | 566 ---------------------- tpch/notebooks/q3/kernel-metadata.json | 15 - tpch/notebooks/q4/execute.ipynb | 507 -------------------- tpch/notebooks/q4/kernel-metadata.json | 15 - tpch/notebooks/q5/execute.ipynb | 543 --------------------- tpch/notebooks/q5/kernel-metadata.json | 15 - tpch/notebooks/q6/execute.ipynb | 516 -------------------- tpch/notebooks/q6/kernel-metadata.json | 15 - tpch/notebooks/q7/execute.ipynb | 603 ------------------------ tpch/notebooks/q7/kernel-metadata.json | 15 - tpch/notebooks/q9/execute.ipynb | 378 --------------- tpch/notebooks/q9/kernel-metadata.json | 15 - 32 files changed, 7170 deletions(-) delete mode 100755 tpch/notebooks/q1/execute.ipynb delete mode 100644 tpch/notebooks/q1/kernel-metadata.json delete mode 100644 tpch/notebooks/q10/execute.ipynb delete mode 100644 tpch/notebooks/q10/kernel-metadata.json delete mode 100644 tpch/notebooks/q11/execute.ipynb delete mode 100644 tpch/notebooks/q11/kernel-metadata.json delete mode 100644 tpch/notebooks/q15/execute.ipynb delete mode 100644 tpch/notebooks/q15/kernel-metadata.json delete mode 100644 tpch/notebooks/q17/execute.ipynb delete mode 100644 tpch/notebooks/q17/kernel-metadata.json delete mode 100644 tpch/notebooks/q18/execute.ipynb delete mode 100644 tpch/notebooks/q18/kernel-metadata.json delete mode 100644 tpch/notebooks/q19/execute.ipynb delete mode 100644 tpch/notebooks/q19/kernel-metadata.json delete mode 100755 tpch/notebooks/q2/execute.ipynb delete mode 100644 tpch/notebooks/q2/kernel-metadata.json delete mode 100644 tpch/notebooks/q20/execute.ipynb delete mode 100644 tpch/notebooks/q20/kernel-metadata.json delete mode 100755 tpch/notebooks/q21/execute.ipynb 
delete mode 100644 tpch/notebooks/q21/kernel-metadata.json delete mode 100755 tpch/notebooks/q3/execute.ipynb delete mode 100644 tpch/notebooks/q3/kernel-metadata.json delete mode 100755 tpch/notebooks/q4/execute.ipynb delete mode 100644 tpch/notebooks/q4/kernel-metadata.json delete mode 100755 tpch/notebooks/q5/execute.ipynb delete mode 100644 tpch/notebooks/q5/kernel-metadata.json delete mode 100755 tpch/notebooks/q6/execute.ipynb delete mode 100644 tpch/notebooks/q6/kernel-metadata.json delete mode 100755 tpch/notebooks/q7/execute.ipynb delete mode 100644 tpch/notebooks/q7/kernel-metadata.json delete mode 100644 tpch/notebooks/q9/execute.ipynb delete mode 100644 tpch/notebooks/q9/kernel-metadata.json diff --git a/tpch/notebooks/q1/execute.ipynb b/tpch/notebooks/q1/execute.ipynb deleted file mode 100755 index de9c52baa..000000000 --- a/tpch/notebooks/q1/execute.ipynb +++ /dev/null @@ -1,441 +0,0 @@ -{ - "cells": [ - { - "cell_type": "code", - "execution_count": null, - "id": "0", - "metadata": { - "papermill": { - "duration": 33.390992, - "end_time": "2024-03-22T17:24:15.601719", - "exception": false, - "start_time": "2024-03-22T17:23:42.210727", - "status": "completed" - }, - "tags": [] - }, - "outputs": [], - "source": [ - "!pip uninstall apache-beam -y && pip install -U pandas polars pyarrow narwhals dask[dataframe]" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "1", - "metadata": { - "papermill": { - "duration": 0.907754, - "end_time": "2024-03-22T17:24:39.053873", - "exception": false, - "start_time": "2024-03-22T17:24:38.146119", - "status": "completed" - }, - "tags": [] - }, - "outputs": [], - "source": [ - "import pandas as pd\n", - "import polars as pl\n", - "\n", - "pd.options.mode.copy_on_write = True\n", - "pd.options.future.infer_string = True" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "2", - "metadata": { - "papermill": { - "duration": 0.021725, - "end_time": "2024-03-22T17:24:39.080999", - "exception": false, - "start_time": "2024-03-22T17:24:39.059274", - "status": "completed" - }, - "tags": [] - }, - "outputs": [], - "source": [ - "from datetime import datetime\n", - "from typing import Any\n", - "\n", - "import narwhals as nw\n", - "\n", - "\n", - "@nw.narwhalify\n", - "def q1(lineitem_ds: Any) -> Any:\n", - " var_1 = datetime(1998, 9, 2)\n", - " return (\n", - " lineitem_ds.filter(nw.col(\"l_shipdate\") <= var_1)\n", - " .with_columns(\n", - " disc_price=nw.col(\"l_extendedprice\") * (1 - nw.col(\"l_discount\")),\n", - " charge=(\n", - " nw.col(\"l_extendedprice\")\n", - " * (1.0 - nw.col(\"l_discount\"))\n", - " * (1.0 + nw.col(\"l_tax\"))\n", - " ),\n", - " )\n", - " .group_by(\"l_returnflag\", \"l_linestatus\")\n", - " .agg(\n", - " nw.col(\"l_quantity\").sum().alias(\"sum_qty\"),\n", - " nw.col(\"l_extendedprice\").sum().alias(\"sum_base_price\"),\n", - " nw.col(\"disc_price\").sum().alias(\"sum_disc_price\"),\n", - " nw.col(\"charge\").sum().alias(\"sum_charge\"),\n", - " nw.col(\"l_quantity\").mean().alias(\"avg_qty\"),\n", - " nw.col(\"l_extendedprice\").mean().alias(\"avg_price\"),\n", - " nw.col(\"l_discount\").mean().alias(\"avg_disc\"),\n", - " nw.len().alias(\"count_order\"),\n", - " )\n", - " .sort(\"l_returnflag\", \"l_linestatus\")\n", - " )" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "3", - "metadata": { - "papermill": { - "duration": 0.013325, - "end_time": "2024-03-22T17:24:39.099766", - "exception": false, - "start_time": "2024-03-22T17:24:39.086441", - "status": 
"completed" - }, - "tags": [] - }, - "outputs": [], - "source": [ - "dir_ = \"/kaggle/input/tpc-h-data-parquet-s-2/\"\n", - "region = dir_ + \"region.parquet\"\n", - "nation = dir_ + \"nation.parquet\"\n", - "customer = dir_ + \"customer.parquet\"\n", - "lineitem = dir_ + \"lineitem.parquet\"\n", - "orders = dir_ + \"orders.parquet\"\n", - "supplier = dir_ + \"supplier.parquet\"\n", - "part = dir_ + \"part.parquet\"\n", - "partsupp = dir_ + \"partsupp.parquet\"" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "4", - "metadata": { - "papermill": { - "duration": 0.014284, - "end_time": "2024-03-22T17:24:39.119737", - "exception": false, - "start_time": "2024-03-22T17:24:39.105453", - "status": "completed" - }, - "tags": [] - }, - "outputs": [], - "source": [ - "import dask.dataframe as dd\n", - "import pyarrow.parquet as pq\n", - "\n", - "IO_FUNCS = {\n", - " \"pandas\": lambda x: pd.read_parquet(x, engine=\"pyarrow\"),\n", - " \"pandas[pyarrow]\": lambda x: pd.read_parquet(\n", - " x, engine=\"pyarrow\", dtype_backend=\"pyarrow\"\n", - " ),\n", - " \"polars[eager]\": lambda x: pl.read_parquet(x),\n", - " \"polars[lazy]\": lambda x: pl.scan_parquet(x),\n", - " \"pyarrow\": lambda x: pq.read_table(x),\n", - " \"dask\": lambda x: dd.read_parquet(x, engine=\"pyarrow\", dtype_backend=\"pyarrow\"),\n", - "}" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "5", - "metadata": {}, - "outputs": [], - "source": [ - "results = {}" - ] - }, - { - "cell_type": "markdown", - "id": "6", - "metadata": {}, - "source": [ - "## PyArrow.table" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "7", - "metadata": {}, - "outputs": [], - "source": [ - "tool = \"pyarrow\"\n", - "fn = IO_FUNCS[tool]\n", - "timings = %timeit -o -q q1(fn(lineitem))\n", - "results[tool] = timings.all_runs" - ] - }, - { - "cell_type": "markdown", - "id": "8", - "metadata": { - "papermill": { - "duration": 0.005113, - "end_time": "2024-03-22T17:24:39.130472", - "exception": false, - "start_time": "2024-03-22T17:24:39.125359", - "status": "completed" - }, - "tags": [] - }, - "source": [ - "## pandas" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "9", - "metadata": { - "papermill": { - "duration": 196.786925, - "end_time": "2024-03-22T17:27:55.922832", - "exception": false, - "start_time": "2024-03-22T17:24:39.135907", - "status": "completed" - }, - "tags": [] - }, - "outputs": [], - "source": [ - "tool = \"pandas\"\n", - "fn = IO_FUNCS[tool]\n", - "timings = %timeit -o -q q1(lineitem_ds=fn(lineitem))\n", - "results[tool] = timings.all_runs" - ] - }, - { - "cell_type": "markdown", - "id": "10", - "metadata": { - "papermill": { - "duration": 0.005184, - "end_time": "2024-03-22T17:27:55.933407", - "exception": false, - "start_time": "2024-03-22T17:27:55.928223", - "status": "completed" - }, - "tags": [] - }, - "source": [ - "## pandas, pyarrow dtypes" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "11", - "metadata": { - "papermill": { - "duration": 158.748353, - "end_time": "2024-03-22T17:30:34.688289", - "exception": false, - "start_time": "2024-03-22T17:27:55.939936", - "status": "completed" - }, - "tags": [] - }, - "outputs": [], - "source": [ - "tool = \"pandas[pyarrow]\"\n", - "fn = IO_FUNCS[tool]\n", - "timings = %timeit -o -q q1(fn(lineitem))\n", - "results[tool] = timings.all_runs" - ] - }, - { - "cell_type": "markdown", - "id": "12", - "metadata": { - "papermill": { - "duration": 0.005773, - "end_time": 
"2024-03-22T17:30:34.700300", - "exception": false, - "start_time": "2024-03-22T17:30:34.694527", - "status": "completed" - }, - "tags": [] - }, - "source": [ - "## Polars read_parquet" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "13", - "metadata": { - "papermill": { - "duration": 37.821116, - "end_time": "2024-03-22T17:31:12.527466", - "exception": false, - "start_time": "2024-03-22T17:30:34.706350", - "status": "completed" - }, - "tags": [] - }, - "outputs": [], - "source": [ - "tool = \"polars[eager]\"\n", - "fn = IO_FUNCS[tool]\n", - "timings = %timeit -o -q q1(fn(lineitem))\n", - "results[tool] = timings.all_runs" - ] - }, - { - "cell_type": "markdown", - "id": "14", - "metadata": { - "papermill": { - "duration": 0.005515, - "end_time": "2024-03-22T17:31:12.539068", - "exception": false, - "start_time": "2024-03-22T17:31:12.533553", - "status": "completed" - }, - "tags": [] - }, - "source": [ - "## Polars scan_parquet" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "15", - "metadata": { - "papermill": { - "duration": 4.800698, - "end_time": "2024-03-22T17:31:17.346813", - "exception": false, - "start_time": "2024-03-22T17:31:12.546115", - "status": "completed" - }, - "tags": [] - }, - "outputs": [], - "source": [ - "tool = \"polars[lazy]\"\n", - "fn = IO_FUNCS[tool]\n", - "timings = %timeit -o -q q1(fn(lineitem)).collect()\n", - "results[tool] = timings.all_runs" - ] - }, - { - "cell_type": "markdown", - "id": "16", - "metadata": {}, - "source": [ - "## Dask Dataframe" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "17", - "metadata": {}, - "outputs": [], - "source": [ - "tool = \"dask\"\n", - "fn = IO_FUNCS[tool]\n", - "timings = %timeit -o -q q1(fn(lineitem)).collect()\n", - "results[tool] = timings.all_runs" - ] - }, - { - "cell_type": "markdown", - "id": "18", - "metadata": {}, - "source": [ - "## Save" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "19", - "metadata": {}, - "outputs": [], - "source": [ - "import json\n", - "\n", - "with open(\"results.json\", \"w\") as fd:\n", - " json.dump(results, fd)" - ] - } - ], - "metadata": { - "kaggle": { - "accelerator": "none", - "dataSources": [ - { - "sourceId": 167796716, - "sourceType": "kernelVersion" - }, - { - "sourceId": 167796934, - "sourceType": "kernelVersion" - }, - { - "sourceId": 167796952, - "sourceType": "kernelVersion" - }, - { - "sourceId": 167796969, - "sourceType": "kernelVersion" - } - ], - "isGpuEnabled": false, - "isInternetEnabled": true, - "language": "python", - "sourceType": "notebook" - }, - "kernelspec": { - "display_name": "Python 3", - "language": "python", - "name": "python3" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.11.9" - }, - "papermill": { - "default_parameters": {}, - "duration": 458.423327, - "end_time": "2024-03-22T17:31:18.077306", - "environment_variables": {}, - "exception": null, - "input_path": "__notebook__.ipynb", - "output_path": "__notebook__.ipynb", - "parameters": {}, - "start_time": "2024-03-22T17:23:39.653979", - "version": "2.5.0" - } - }, - "nbformat": 4, - "nbformat_minor": 5 -} diff --git a/tpch/notebooks/q1/kernel-metadata.json b/tpch/notebooks/q1/kernel-metadata.json deleted file mode 100644 index 6aa41f669..000000000 --- a/tpch/notebooks/q1/kernel-metadata.json +++ /dev/null 
@@ -1,15 +0,0 @@ -{ - "id": "marcogorelli/narwhals-tpch-q1-s2", - "title": "Narwhals TPCH Q1 S2", - "code_file": "execute.ipynb", - "language": "python", - "kernel_type": "notebook", - "is_private": "false", - "enable_gpu": "false", - "enable_tpu": "false", - "enable_internet": "true", - "dataset_sources": [], - "competition_sources": [], - "kernel_sources": ["marcogorelli/tpc-h-data-parquet-s-2"], - "model_sources": [] -} \ No newline at end of file diff --git a/tpch/notebooks/q10/execute.ipynb b/tpch/notebooks/q10/execute.ipynb deleted file mode 100644 index 9ff211773..000000000 --- a/tpch/notebooks/q10/execute.ipynb +++ /dev/null @@ -1,385 +0,0 @@ -{ - "cells": [ - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "papermill": { - "duration": 33.390992, - "end_time": "2024-03-22T17:24:15.601719", - "exception": false, - "start_time": "2024-03-22T17:23:42.210727", - "status": "completed" - }, - "tags": [] - }, - "outputs": [], - "source": [ - "!pip uninstall apache-beam -y && pip install -U pandas polars pyarrow narwhals" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "papermill": { - "duration": 0.907754, - "end_time": "2024-03-22T17:24:39.053873", - "exception": false, - "start_time": "2024-03-22T17:24:38.146119", - "status": "completed" - }, - "tags": [] - }, - "outputs": [], - "source": [ - "import pandas as pd\n", - "import polars as pl\n", - "\n", - "pd.options.mode.copy_on_write = True\n", - "pd.options.future.infer_string = True" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "papermill": { - "duration": 0.021725, - "end_time": "2024-03-22T17:24:39.080999", - "exception": false, - "start_time": "2024-03-22T17:24:39.059274", - "status": "completed" - }, - "tags": [] - }, - "outputs": [], - "source": [ - "from datetime import datetime\n", - "from typing import Any\n", - "\n", - "import narwhals as nw\n", - "\n", - "\n", - "def q10(\n", - " customer_ds_raw: Any,\n", - " nation_ds_raw: Any,\n", - " lineitem_ds_raw: Any,\n", - " orders_ds_raw: Any,\n", - ") -> Any:\n", - " nation_ds = nw.from_native(nation_ds_raw)\n", - " line_item_ds = nw.from_native(lineitem_ds_raw)\n", - " orders_ds = nw.from_native(orders_ds_raw)\n", - " customer_ds = nw.from_native(customer_ds_raw)\n", - "\n", - " var1 = datetime(1993, 10, 1)\n", - " var2 = datetime(1994, 1, 1)\n", - "\n", - " result = (\n", - " customer_ds.join(orders_ds, left_on=\"c_custkey\", right_on=\"o_custkey\")\n", - " .join(line_item_ds, left_on=\"o_orderkey\", right_on=\"l_orderkey\")\n", - " .join(nation_ds, left_on=\"c_nationkey\", right_on=\"n_nationkey\")\n", - " .filter(nw.col(\"o_orderdate\").is_between(var1, var2, closed=\"left\"))\n", - " .filter(nw.col(\"l_returnflag\") == \"R\")\n", - " .with_columns(\n", - " (nw.col(\"l_extendedprice\") * (1 - nw.col(\"l_discount\"))).alias(\"revenue\")\n", - " )\n", - " .group_by(\n", - " \"c_custkey\",\n", - " \"c_name\",\n", - " \"c_acctbal\",\n", - " \"c_phone\",\n", - " \"n_name\",\n", - " \"c_address\",\n", - " \"c_comment\",\n", - " )\n", - " .agg(nw.sum(\"revenue\"))\n", - " .select(\n", - " \"c_custkey\",\n", - " \"c_name\",\n", - " \"revenue\",\n", - " \"c_acctbal\",\n", - " \"n_name\",\n", - " \"c_address\",\n", - " \"c_phone\",\n", - " \"c_comment\",\n", - " )\n", - " .sort(by=\"revenue\", descending=True)\n", - " .head(20)\n", - " )\n", - "\n", - " return nw.to_native(result)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "papermill": { - "duration": 
0.013325, - "end_time": "2024-03-22T17:24:39.099766", - "exception": false, - "start_time": "2024-03-22T17:24:39.086441", - "status": "completed" - }, - "tags": [] - }, - "outputs": [], - "source": [ - "dir_ = \"/kaggle/input/tpc-h-data-parquet-s-2/\"\n", - "nation = dir_ + \"nation.parquet\"\n", - "lineitem = dir_ + \"lineitem.parquet\"\n", - "orders = dir_ + \"orders.parquet\"\n", - "customer = dir_ + \"customer.parquet\"" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "papermill": { - "duration": 0.014284, - "end_time": "2024-03-22T17:24:39.119737", - "exception": false, - "start_time": "2024-03-22T17:24:39.105453", - "status": "completed" - }, - "tags": [] - }, - "outputs": [], - "source": [ - "IO_FUNCS = {\n", - " \"pandas\": lambda x: pd.read_parquet(x, engine=\"pyarrow\"),\n", - " \"pandas[pyarrow]\": lambda x: pd.read_parquet(\n", - " x, engine=\"pyarrow\", dtype_backend=\"pyarrow\"\n", - " ),\n", - " \"polars[eager]\": lambda x: pl.read_parquet(x),\n", - " \"polars[lazy]\": lambda x: pl.scan_parquet(x),\n", - "}" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "results = {}" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "papermill": { - "duration": 0.005113, - "end_time": "2024-03-22T17:24:39.130472", - "exception": false, - "start_time": "2024-03-22T17:24:39.125359", - "status": "completed" - }, - "tags": [] - }, - "source": [ - "## pandas via Narwhals" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "papermill": { - "duration": 196.786925, - "end_time": "2024-03-22T17:27:55.922832", - "exception": false, - "start_time": "2024-03-22T17:24:39.135907", - "status": "completed" - }, - "tags": [] - }, - "outputs": [], - "source": [ - "tool = \"pandas\"\n", - "fn = IO_FUNCS[tool]\n", - "timings = %timeit -o -q q10(fn(customer), fn(nation), fn(lineitem), fn(orders))\n", - "results[tool] = timings.all_runs" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "papermill": { - "duration": 0.005184, - "end_time": "2024-03-22T17:27:55.933407", - "exception": false, - "start_time": "2024-03-22T17:27:55.928223", - "status": "completed" - }, - "tags": [] - }, - "source": [ - "## pandas, pyarrow dtypes, via Narwhals" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "papermill": { - "duration": 158.748353, - "end_time": "2024-03-22T17:30:34.688289", - "exception": false, - "start_time": "2024-03-22T17:27:55.939936", - "status": "completed" - }, - "tags": [] - }, - "outputs": [], - "source": [ - "tool = \"pandas[pyarrow]\"\n", - "fn = IO_FUNCS[tool]\n", - "timings = %timeit -o -q q10(fn(customer), fn(nation), fn(lineitem), fn(orders))\n", - "results[tool] = timings.all_runs" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "papermill": { - "duration": 0.005773, - "end_time": "2024-03-22T17:30:34.7003", - "exception": false, - "start_time": "2024-03-22T17:30:34.694527", - "status": "completed" - }, - "tags": [] - }, - "source": [ - "## Polars read_parquet" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "papermill": { - "duration": 37.821116, - "end_time": "2024-03-22T17:31:12.527466", - "exception": false, - "start_time": "2024-03-22T17:30:34.70635", - "status": "completed" - }, - "tags": [] - }, - "outputs": [], - "source": [ - "tool = \"polars[eager]\"\n", - "fn = IO_FUNCS[tool]\n", - "timings = %timeit -o -q q10(fn(customer), fn(nation), fn(lineitem), fn(orders))\n", - 
"results[tool] = timings.all_runs" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "papermill": { - "duration": 0.005515, - "end_time": "2024-03-22T17:31:12.539068", - "exception": false, - "start_time": "2024-03-22T17:31:12.533553", - "status": "completed" - }, - "tags": [] - }, - "source": [ - "## Polars scan_parquet" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "papermill": { - "duration": 4.800698, - "end_time": "2024-03-22T17:31:17.346813", - "exception": false, - "start_time": "2024-03-22T17:31:12.546115", - "status": "completed" - }, - "tags": [] - }, - "outputs": [], - "source": [ - "tool = \"polars[lazy]\"\n", - "fn = IO_FUNCS[tool]\n", - "timings = %timeit -o -q q10(fn(customer), fn(nation), fn(lineitem), fn(orders)).collect()\n", - "results[tool] = timings.all_runs" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Save" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "import json\n", - "\n", - "with open(\"results.json\", \"w\") as fd:\n", - " json.dump(results, fd)" - ] - } - ], - "metadata": { - "kaggle": { - "accelerator": "none", - "dataSources": [ - { - "sourceId": 167796934, - "sourceType": "kernelVersion" - } - ], - "dockerImageVersionId": 30673, - "isGpuEnabled": false, - "isInternetEnabled": true, - "language": "python", - "sourceType": "notebook" - }, - "kernelspec": { - "display_name": "Python 3", - "language": "python", - "name": "python3" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.10.13" - }, - "papermill": { - "default_parameters": {}, - "duration": 458.423327, - "end_time": "2024-03-22T17:31:18.077306", - "environment_variables": {}, - "exception": null, - "input_path": "__notebook__.ipynb", - "output_path": "__notebook__.ipynb", - "parameters": {}, - "start_time": "2024-03-22T17:23:39.653979", - "version": "2.5.0" - } - }, - "nbformat": 4, - "nbformat_minor": 4 -} diff --git a/tpch/notebooks/q10/kernel-metadata.json b/tpch/notebooks/q10/kernel-metadata.json deleted file mode 100644 index 5fa48ff5e..000000000 --- a/tpch/notebooks/q10/kernel-metadata.json +++ /dev/null @@ -1,15 +0,0 @@ -{ - "id": "marcogorelli/narwhals-tpch-q10-s2", - "title": "Narwhals TPCH Q10 S2", - "code_file": "execute.ipynb", - "language": "python", - "kernel_type": "notebook", - "is_private": "false", - "enable_gpu": "false", - "enable_tpu": "false", - "enable_internet": "true", - "dataset_sources": [], - "competition_sources": [], - "kernel_sources": ["marcogorelli/tpc-h-data-parquet-s-2"], - "model_sources": [] -} \ No newline at end of file diff --git a/tpch/notebooks/q11/execute.ipynb b/tpch/notebooks/q11/execute.ipynb deleted file mode 100644 index f5bbc0f9c..000000000 --- a/tpch/notebooks/q11/execute.ipynb +++ /dev/null @@ -1,379 +0,0 @@ -{ - "cells": [ - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "papermill": { - "duration": 33.390992, - "end_time": "2024-03-22T17:24:15.601719", - "exception": false, - "start_time": "2024-03-22T17:23:42.210727", - "status": "completed" - }, - "tags": [] - }, - "outputs": [], - "source": [ - "!pip uninstall apache-beam -y && pip install -U pandas polars pyarrow narwhals" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "papermill": { - "duration": 
0.907754, - "end_time": "2024-03-22T17:24:39.053873", - "exception": false, - "start_time": "2024-03-22T17:24:38.146119", - "status": "completed" - }, - "tags": [] - }, - "outputs": [], - "source": [ - "import pandas as pd\n", - "import polars as pl\n", - "\n", - "pd.options.mode.copy_on_write = True\n", - "pd.options.future.infer_string = True" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "papermill": { - "duration": 0.021725, - "end_time": "2024-03-22T17:24:39.080999", - "exception": false, - "start_time": "2024-03-22T17:24:39.059274", - "status": "completed" - }, - "tags": [] - }, - "outputs": [], - "source": [ - "from typing import Any\n", - "\n", - "import narwhals as nw\n", - "\n", - "\n", - "def q11(\n", - " partsupp_ds_raw: Any,\n", - " nation_ds_raw: Any,\n", - " supplier_ds_raw: Any,\n", - ") -> Any:\n", - " nation_ds = nw.from_native(nation_ds_raw)\n", - " partsupp_ds = nw.from_native(partsupp_ds_raw)\n", - " supplier_ds = nw.from_native(supplier_ds_raw)\n", - "\n", - " var1 = \"GERMANY\"\n", - " var2 = 0.0001\n", - "\n", - " q1 = (\n", - " partsupp_ds.join(supplier_ds, left_on=\"ps_suppkey\", right_on=\"s_suppkey\")\n", - " .join(nation_ds, left_on=\"s_nationkey\", right_on=\"n_nationkey\")\n", - " .filter(nw.col(\"n_name\") == var1)\n", - " )\n", - " q2 = q1.select(\n", - " (nw.col(\"ps_supplycost\") * nw.col(\"ps_availqty\")).sum().round(2).alias(\"tmp\")\n", - " * var2\n", - " )\n", - "\n", - " q_final = (\n", - " q1.with_columns((nw.col(\"ps_supplycost\") * nw.col(\"ps_availqty\")).alias(\"value\"))\n", - " .group_by(\"ps_partkey\")\n", - " .agg(nw.sum(\"value\"))\n", - " .join(q2, how=\"cross\")\n", - " .filter(nw.col(\"value\") > nw.col(\"tmp\"))\n", - " .select(\"ps_partkey\", \"value\")\n", - " .sort(\"value\", descending=True)\n", - " )\n", - "\n", - " return nw.to_native(q_final)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "papermill": { - "duration": 0.013325, - "end_time": "2024-03-22T17:24:39.099766", - "exception": false, - "start_time": "2024-03-22T17:24:39.086441", - "status": "completed" - }, - "tags": [] - }, - "outputs": [], - "source": [ - "dir_ = \"/kaggle/input/tpc-h-data-parquet-s-2/\"\n", - "nation = dir_ + \"nation.parquet\"\n", - "supplier = dir_ + \"supplier.parquet\"\n", - "partsupp = dir_ + \"partsupp.parquet\"" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "papermill": { - "duration": 0.014284, - "end_time": "2024-03-22T17:24:39.119737", - "exception": false, - "start_time": "2024-03-22T17:24:39.105453", - "status": "completed" - }, - "tags": [] - }, - "outputs": [], - "source": [ - "IO_FUNCS = {\n", - " \"pandas\": lambda x: pd.read_parquet(x, engine=\"pyarrow\"),\n", - " \"pandas[pyarrow]\": lambda x: pd.read_parquet(\n", - " x, engine=\"pyarrow\", dtype_backend=\"pyarrow\"\n", - " ),\n", - " \"polars[eager]\": lambda x: pl.read_parquet(x),\n", - " \"polars[lazy]\": lambda x: pl.scan_parquet(x),\n", - "}" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "results = {}" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "papermill": { - "duration": 0.005113, - "end_time": "2024-03-22T17:24:39.130472", - "exception": false, - "start_time": "2024-03-22T17:24:39.125359", - "status": "completed" - }, - "tags": [] - }, - "source": [ - "## pandas via Narwhals" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "papermill": { - "duration": 
196.786925, - "end_time": "2024-03-22T17:27:55.922832", - "exception": false, - "start_time": "2024-03-22T17:24:39.135907", - "status": "completed" - }, - "tags": [] - }, - "outputs": [], - "source": [ - "tool = \"pandas\"\n", - "fn = IO_FUNCS[tool]\n", - "timings = %timeit -o -q q11(fn(partsupp), fn(nation), fn(supplier))\n", - "results[tool] = timings.all_runs" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "papermill": { - "duration": 0.005184, - "end_time": "2024-03-22T17:27:55.933407", - "exception": false, - "start_time": "2024-03-22T17:27:55.928223", - "status": "completed" - }, - "tags": [] - }, - "source": [ - "## pandas, pyarrow dtypes, via Narwhals" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "papermill": { - "duration": 158.748353, - "end_time": "2024-03-22T17:30:34.688289", - "exception": false, - "start_time": "2024-03-22T17:27:55.939936", - "status": "completed" - }, - "tags": [] - }, - "outputs": [], - "source": [ - "tool = \"pandas[pyarrow]\"\n", - "fn = IO_FUNCS[tool]\n", - "timings = %timeit -o -q q11(fn(partsupp), fn(nation), fn(supplier))\n", - "results[tool] = timings.all_runs" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "papermill": { - "duration": 0.005773, - "end_time": "2024-03-22T17:30:34.7003", - "exception": false, - "start_time": "2024-03-22T17:30:34.694527", - "status": "completed" - }, - "tags": [] - }, - "source": [ - "## Polars read_parquet" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "papermill": { - "duration": 37.821116, - "end_time": "2024-03-22T17:31:12.527466", - "exception": false, - "start_time": "2024-03-22T17:30:34.70635", - "status": "completed" - }, - "tags": [] - }, - "outputs": [], - "source": [ - "tool = \"polars[eager]\"\n", - "fn = IO_FUNCS[tool]\n", - "timings = %timeit -o -q q11(fn(partsupp), fn(nation), fn(supplier))\n", - "results[tool] = timings.all_runs" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "papermill": { - "duration": 0.005515, - "end_time": "2024-03-22T17:31:12.539068", - "exception": false, - "start_time": "2024-03-22T17:31:12.533553", - "status": "completed" - }, - "tags": [] - }, - "source": [ - "## Polars scan_parquet" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "papermill": { - "duration": 4.800698, - "end_time": "2024-03-22T17:31:17.346813", - "exception": false, - "start_time": "2024-03-22T17:31:12.546115", - "status": "completed" - }, - "tags": [] - }, - "outputs": [], - "source": [ - "tool = \"polars[lazy]\"\n", - "fn = IO_FUNCS[tool]\n", - "timings = %timeit -o -q q11(fn(partsupp), fn(nation), fn(supplier)).collect()\n", - "results[tool] = timings.all_runs" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Save" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "import json\n", - "\n", - "with open(\"results.json\", \"w\") as fd:\n", - " json.dump(results, fd)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "from pprint import pprint\n", - "\n", - "pprint(results)" - ] - } - ], - "metadata": { - "kaggle": { - "accelerator": "none", - "dataSources": [ - { - "sourceId": 167796934, - "sourceType": "kernelVersion" - } - ], - "dockerImageVersionId": 30673, - "isGpuEnabled": false, - "isInternetEnabled": true, - "language": "python", - "sourceType": "notebook" - }, - "kernelspec": { - "display_name": "Python 3", - 
"language": "python", - "name": "python3" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.10.13" - }, - "papermill": { - "default_parameters": {}, - "duration": 458.423327, - "end_time": "2024-03-22T17:31:18.077306", - "environment_variables": {}, - "exception": null, - "input_path": "__notebook__.ipynb", - "output_path": "__notebook__.ipynb", - "parameters": {}, - "start_time": "2024-03-22T17:23:39.653979", - "version": "2.5.0" - } - }, - "nbformat": 4, - "nbformat_minor": 4 -} diff --git a/tpch/notebooks/q11/kernel-metadata.json b/tpch/notebooks/q11/kernel-metadata.json deleted file mode 100644 index 2aa47c6c9..000000000 --- a/tpch/notebooks/q11/kernel-metadata.json +++ /dev/null @@ -1,15 +0,0 @@ -{ - "id": "marcogorelli/narwhals-tpch-q11-s2", - "title": "Narwhals TPCH Q11 S2", - "code_file": "execute.ipynb", - "language": "python", - "kernel_type": "notebook", - "is_private": "false", - "enable_gpu": "false", - "enable_tpu": "false", - "enable_internet": "true", - "dataset_sources": [], - "competition_sources": [], - "kernel_sources": ["marcogorelli/tpc-h-data-parquet-s-2"], - "model_sources": [] -} \ No newline at end of file diff --git a/tpch/notebooks/q15/execute.ipynb b/tpch/notebooks/q15/execute.ipynb deleted file mode 100644 index d108a7196..000000000 --- a/tpch/notebooks/q15/execute.ipynb +++ /dev/null @@ -1,366 +0,0 @@ -{ - "cells": [ - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "papermill": { - "duration": 33.390992, - "end_time": "2024-03-22T17:24:15.601719", - "exception": false, - "start_time": "2024-03-22T17:23:42.210727", - "status": "completed" - }, - "tags": [] - }, - "outputs": [], - "source": [ - "!pip uninstall apache-beam -y && pip install -U pandas polars pyarrow narwhals" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "papermill": { - "duration": 0.907754, - "end_time": "2024-03-22T17:24:39.053873", - "exception": false, - "start_time": "2024-03-22T17:24:38.146119", - "status": "completed" - }, - "tags": [] - }, - "outputs": [], - "source": [ - "import pandas as pd\n", - "import polars as pl\n", - "\n", - "pd.options.mode.copy_on_write = True\n", - "pd.options.future.infer_string = True" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "papermill": { - "duration": 0.021725, - "end_time": "2024-03-22T17:24:39.080999", - "exception": false, - "start_time": "2024-03-22T17:24:39.059274", - "status": "completed" - }, - "tags": [] - }, - "outputs": [], - "source": [ - "from datetime import datetime\n", - "from typing import Any\n", - "\n", - "import narwhals as nw\n", - "\n", - "\n", - "def q15(\n", - " lineitem_ds_raw: Any,\n", - " supplier_ds_raw: Any,\n", - ") -> Any:\n", - " lineitem_ds = nw.from_native(lineitem_ds_raw)\n", - " supplier_ds = nw.from_native(supplier_ds_raw)\n", - "\n", - " var1 = datetime(1996, 1, 1)\n", - " var2 = datetime(1996, 4, 1)\n", - "\n", - " revenue = (\n", - " lineitem_ds.filter(nw.col(\"l_shipdate\").is_between(var1, var2, closed=\"left\"))\n", - " .with_columns(\n", - " (nw.col(\"l_extendedprice\") * (1 - nw.col(\"l_discount\"))).alias(\n", - " \"total_revenue\"\n", - " )\n", - " )\n", - " .group_by(\"l_suppkey\")\n", - " .agg(nw.sum(\"total_revenue\"))\n", - " .select(nw.col(\"l_suppkey\").alias(\"supplier_no\"), nw.col(\"total_revenue\"))\n", - " 
)\n", - "\n", - " result = (\n", - " supplier_ds.join(revenue, left_on=\"s_suppkey\", right_on=\"supplier_no\")\n", - " .filter(nw.col(\"total_revenue\") == nw.col(\"total_revenue\").max())\n", - " .with_columns(nw.col(\"total_revenue\").round(2))\n", - " .select(\"s_suppkey\", \"s_name\", \"s_address\", \"s_phone\", \"total_revenue\")\n", - " .sort(\"s_suppkey\")\n", - " )\n", - "\n", - " return nw.to_native(result)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "papermill": { - "duration": 0.013325, - "end_time": "2024-03-22T17:24:39.099766", - "exception": false, - "start_time": "2024-03-22T17:24:39.086441", - "status": "completed" - }, - "tags": [] - }, - "outputs": [], - "source": [ - "dir_ = \"/kaggle/input/tpc-h-data-parquet-s-2/\"\n", - "lineitem = dir_ + \"lineitem.parquet\"\n", - "supplier = dir_ + \"supplier.parquet\"" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "papermill": { - "duration": 0.014284, - "end_time": "2024-03-22T17:24:39.119737", - "exception": false, - "start_time": "2024-03-22T17:24:39.105453", - "status": "completed" - }, - "tags": [] - }, - "outputs": [], - "source": [ - "IO_FUNCS = {\n", - " \"pandas\": lambda x: pd.read_parquet(x, engine=\"pyarrow\"),\n", - " \"pandas[pyarrow]\": lambda x: pd.read_parquet(\n", - " x, engine=\"pyarrow\", dtype_backend=\"pyarrow\"\n", - " ),\n", - " \"polars[eager]\": lambda x: pl.read_parquet(x),\n", - " \"polars[lazy]\": lambda x: pl.scan_parquet(x),\n", - "}" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "results = {}" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "papermill": { - "duration": 0.005113, - "end_time": "2024-03-22T17:24:39.130472", - "exception": false, - "start_time": "2024-03-22T17:24:39.125359", - "status": "completed" - }, - "tags": [] - }, - "source": [ - "## pandas via Narwhals" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "papermill": { - "duration": 196.786925, - "end_time": "2024-03-22T17:27:55.922832", - "exception": false, - "start_time": "2024-03-22T17:24:39.135907", - "status": "completed" - }, - "tags": [] - }, - "outputs": [], - "source": [ - "tool = \"pandas\"\n", - "fn = IO_FUNCS[tool]\n", - "timings = %timeit -o -q q15(fn(lineitem), fn(supplier))\n", - "results[tool] = timings.all_runs" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "papermill": { - "duration": 0.005184, - "end_time": "2024-03-22T17:27:55.933407", - "exception": false, - "start_time": "2024-03-22T17:27:55.928223", - "status": "completed" - }, - "tags": [] - }, - "source": [ - "## pandas, pyarrow dtypes, via Narwhals" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "papermill": { - "duration": 158.748353, - "end_time": "2024-03-22T17:30:34.688289", - "exception": false, - "start_time": "2024-03-22T17:27:55.939936", - "status": "completed" - }, - "tags": [] - }, - "outputs": [], - "source": [ - "tool = \"pandas[pyarrow]\"\n", - "fn = IO_FUNCS[tool]\n", - "timings = %timeit -o -q q15(fn(lineitem), fn(supplier))\n", - "results[tool] = timings.all_runs" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "papermill": { - "duration": 0.005773, - "end_time": "2024-03-22T17:30:34.7003", - "exception": false, - "start_time": "2024-03-22T17:30:34.694527", - "status": "completed" - }, - "tags": [] - }, - "source": [ - "## Polars read_parquet" - ] - }, - { - "cell_type": "code", - "execution_count": null, 
- "metadata": { - "papermill": { - "duration": 37.821116, - "end_time": "2024-03-22T17:31:12.527466", - "exception": false, - "start_time": "2024-03-22T17:30:34.70635", - "status": "completed" - }, - "tags": [] - }, - "outputs": [], - "source": [ - "tool = \"polars[eager]\"\n", - "fn = IO_FUNCS[tool]\n", - "timings = %timeit -o -q q15(fn(lineitem), fn(supplier))\n", - "results[tool] = timings.all_runs" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "papermill": { - "duration": 0.005515, - "end_time": "2024-03-22T17:31:12.539068", - "exception": false, - "start_time": "2024-03-22T17:31:12.533553", - "status": "completed" - }, - "tags": [] - }, - "source": [ - "## Polars scan_parquet" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "papermill": { - "duration": 4.800698, - "end_time": "2024-03-22T17:31:17.346813", - "exception": false, - "start_time": "2024-03-22T17:31:12.546115", - "status": "completed" - }, - "tags": [] - }, - "outputs": [], - "source": [ - "tool = \"polars[lazy]\"\n", - "fn = IO_FUNCS[tool]\n", - "timings = %timeit -o -q q15(fn(lineitem), fn(supplier)).collect()\n", - "results[tool] = timings.all_runs" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Save" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "import json\n", - "\n", - "with open(\"results.json\", \"w\") as fd:\n", - " json.dump(results, fd)" - ] - } - ], - "metadata": { - "kaggle": { - "accelerator": "none", - "dataSources": [ - { - "sourceId": 167796934, - "sourceType": "kernelVersion" - } - ], - "dockerImageVersionId": 30673, - "isGpuEnabled": false, - "isInternetEnabled": true, - "language": "python", - "sourceType": "notebook" - }, - "kernelspec": { - "display_name": "Python 3", - "language": "python", - "name": "python3" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.10.13" - }, - "papermill": { - "default_parameters": {}, - "duration": 458.423327, - "end_time": "2024-03-22T17:31:18.077306", - "environment_variables": {}, - "exception": null, - "input_path": "__notebook__.ipynb", - "output_path": "__notebook__.ipynb", - "parameters": {}, - "start_time": "2024-03-22T17:23:39.653979", - "version": "2.5.0" - } - }, - "nbformat": 4, - "nbformat_minor": 4 -} diff --git a/tpch/notebooks/q15/kernel-metadata.json b/tpch/notebooks/q15/kernel-metadata.json deleted file mode 100644 index e552c9477..000000000 --- a/tpch/notebooks/q15/kernel-metadata.json +++ /dev/null @@ -1,15 +0,0 @@ -{ - "id": "marcogorelli/narwhals-tpch-q15-s2", - "title": "Narwhals TPCH Q15 S2", - "code_file": "execute.ipynb", - "language": "python", - "kernel_type": "notebook", - "is_private": "false", - "enable_gpu": "false", - "enable_tpu": "false", - "enable_internet": "true", - "dataset_sources": [], - "competition_sources": [], - "kernel_sources": ["marcogorelli/tpc-h-data-parquet-s-2"], - "model_sources": [] -} \ No newline at end of file diff --git a/tpch/notebooks/q17/execute.ipynb b/tpch/notebooks/q17/execute.ipynb deleted file mode 100644 index 4d012f088..000000000 --- a/tpch/notebooks/q17/execute.ipynb +++ /dev/null @@ -1,357 +0,0 @@ -{ - "cells": [ - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "papermill": { - "duration": 33.390992, - "end_time": 
"2024-03-22T17:24:15.601719", - "exception": false, - "start_time": "2024-03-22T17:23:42.210727", - "status": "completed" - }, - "tags": [] - }, - "outputs": [], - "source": [ - "!pip uninstall apache-beam -y && pip install -U pandas polars pyarrow narwhals" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "papermill": { - "duration": 0.907754, - "end_time": "2024-03-22T17:24:39.053873", - "exception": false, - "start_time": "2024-03-22T17:24:38.146119", - "status": "completed" - }, - "tags": [] - }, - "outputs": [], - "source": [ - "import pandas as pd\n", - "import polars as pl\n", - "\n", - "pd.options.mode.copy_on_write = True\n", - "pd.options.future.infer_string = True" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "papermill": { - "duration": 0.021725, - "end_time": "2024-03-22T17:24:39.080999", - "exception": false, - "start_time": "2024-03-22T17:24:39.059274", - "status": "completed" - }, - "tags": [] - }, - "outputs": [], - "source": [ - "from typing import Any\n", - "\n", - "import narwhals as nw\n", - "\n", - "\n", - "def q17(lineitem_ds_raw: Any, part_ds_raw: Any) -> Any:\n", - " lineitem_ds = nw.from_native(lineitem_ds_raw)\n", - " part_ds = nw.from_native(part_ds_raw)\n", - "\n", - " var1 = \"Brand#23\"\n", - " var2 = \"MED BOX\"\n", - "\n", - " query1 = (\n", - " part_ds.filter(nw.col(\"p_brand\") == var1)\n", - " .filter(nw.col(\"p_container\") == var2)\n", - " .join(lineitem_ds, how=\"left\", left_on=\"p_partkey\", right_on=\"l_partkey\")\n", - " )\n", - "\n", - " final_query = (\n", - " query1.group_by(\"p_partkey\")\n", - " .agg((0.2 * nw.col(\"l_quantity\").mean()).alias(\"avg_quantity\"))\n", - " .select(nw.col(\"p_partkey\").alias(\"key\"), nw.col(\"avg_quantity\"))\n", - " .join(query1, left_on=\"key\", right_on=\"p_partkey\")\n", - " .filter(nw.col(\"l_quantity\") < nw.col(\"avg_quantity\"))\n", - " .select((nw.col(\"l_extendedprice\").sum() / 7.0).round(2).alias(\"avg_yearly\"))\n", - " )\n", - "\n", - " return nw.to_native(final_query)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "papermill": { - "duration": 0.013325, - "end_time": "2024-03-22T17:24:39.099766", - "exception": false, - "start_time": "2024-03-22T17:24:39.086441", - "status": "completed" - }, - "tags": [] - }, - "outputs": [], - "source": [ - "dir_ = \"/kaggle/input/tpc-h-data-parquet-s-2/\"\n", - "lineitem = dir_ + \"lineitem.parquet\"\n", - "part = dir_ + \"part.parquet\"" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "papermill": { - "duration": 0.014284, - "end_time": "2024-03-22T17:24:39.119737", - "exception": false, - "start_time": "2024-03-22T17:24:39.105453", - "status": "completed" - }, - "tags": [] - }, - "outputs": [], - "source": [ - "IO_FUNCS = {\n", - " \"pandas\": lambda x: pd.read_parquet(x, engine=\"pyarrow\"),\n", - " \"pandas[pyarrow]\": lambda x: pd.read_parquet(\n", - " x, engine=\"pyarrow\", dtype_backend=\"pyarrow\"\n", - " ),\n", - " \"polars[eager]\": lambda x: pl.read_parquet(x),\n", - " \"polars[lazy]\": lambda x: pl.scan_parquet(x),\n", - "}" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "results = {}" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "papermill": { - "duration": 0.005113, - "end_time": "2024-03-22T17:24:39.130472", - "exception": false, - "start_time": "2024-03-22T17:24:39.125359", - "status": "completed" - }, - "tags": [] - }, - "source": [ 
- "## pandas via Narwhals" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "papermill": { - "duration": 196.786925, - "end_time": "2024-03-22T17:27:55.922832", - "exception": false, - "start_time": "2024-03-22T17:24:39.135907", - "status": "completed" - }, - "tags": [] - }, - "outputs": [], - "source": [ - "tool = \"pandas\"\n", - "fn = IO_FUNCS[tool]\n", - "timings = %timeit -o -q q17(fn(lineitem), fn(part))\n", - "results[tool] = timings.all_runs" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "papermill": { - "duration": 0.005184, - "end_time": "2024-03-22T17:27:55.933407", - "exception": false, - "start_time": "2024-03-22T17:27:55.928223", - "status": "completed" - }, - "tags": [] - }, - "source": [ - "## pandas, pyarrow dtypes, via Narwhals" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "papermill": { - "duration": 158.748353, - "end_time": "2024-03-22T17:30:34.688289", - "exception": false, - "start_time": "2024-03-22T17:27:55.939936", - "status": "completed" - }, - "tags": [] - }, - "outputs": [], - "source": [ - "tool = \"pandas[pyarrow]\"\n", - "fn = IO_FUNCS[tool]\n", - "timings = %timeit -o -q q17(fn(lineitem), fn(part))\n", - "results[tool] = timings.all_runs" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "papermill": { - "duration": 0.005773, - "end_time": "2024-03-22T17:30:34.7003", - "exception": false, - "start_time": "2024-03-22T17:30:34.694527", - "status": "completed" - }, - "tags": [] - }, - "source": [ - "## Polars read_parquet" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "papermill": { - "duration": 37.821116, - "end_time": "2024-03-22T17:31:12.527466", - "exception": false, - "start_time": "2024-03-22T17:30:34.70635", - "status": "completed" - }, - "tags": [] - }, - "outputs": [], - "source": [ - "tool = \"polars[eager]\"\n", - "fn = IO_FUNCS[tool]\n", - "timings = %timeit -o -q q17(fn(lineitem), fn(part))\n", - "results[tool] = timings.all_runs" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "papermill": { - "duration": 0.005515, - "end_time": "2024-03-22T17:31:12.539068", - "exception": false, - "start_time": "2024-03-22T17:31:12.533553", - "status": "completed" - }, - "tags": [] - }, - "source": [ - "## Polars scan_parquet" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "papermill": { - "duration": 4.800698, - "end_time": "2024-03-22T17:31:17.346813", - "exception": false, - "start_time": "2024-03-22T17:31:12.546115", - "status": "completed" - }, - "tags": [] - }, - "outputs": [], - "source": [ - "tool = \"polars[lazy]\"\n", - "fn = IO_FUNCS[tool]\n", - "timings = %timeit -o -q q17(fn(lineitem), fn(part)).collect()\n", - "results[tool] = timings.all_runs" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Save" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "import json\n", - "\n", - "with open(\"results.json\", \"w\") as fd:\n", - " json.dump(results, fd)" - ] - } - ], - "metadata": { - "kaggle": { - "accelerator": "none", - "dataSources": [ - { - "sourceId": 167796934, - "sourceType": "kernelVersion" - } - ], - "dockerImageVersionId": 30673, - "isGpuEnabled": false, - "isInternetEnabled": true, - "language": "python", - "sourceType": "notebook" - }, - "kernelspec": { - "display_name": "Python 3", - "language": "python", - "name": "python3" - }, - "language_info": { - "codemirror_mode": { - "name": 
"ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.10.13" - }, - "papermill": { - "default_parameters": {}, - "duration": 458.423327, - "end_time": "2024-03-22T17:31:18.077306", - "environment_variables": {}, - "exception": null, - "input_path": "__notebook__.ipynb", - "output_path": "__notebook__.ipynb", - "parameters": {}, - "start_time": "2024-03-22T17:23:39.653979", - "version": "2.5.0" - } - }, - "nbformat": 4, - "nbformat_minor": 4 -} diff --git a/tpch/notebooks/q17/kernel-metadata.json b/tpch/notebooks/q17/kernel-metadata.json deleted file mode 100644 index 0fd73368c..000000000 --- a/tpch/notebooks/q17/kernel-metadata.json +++ /dev/null @@ -1,15 +0,0 @@ -{ - "id": "marcogorelli/narwhals-tpch-q17-s2", - "title": "Narwhals TPCH Q17 S2", - "code_file": "execute.ipynb", - "language": "python", - "kernel_type": "notebook", - "is_private": "false", - "enable_gpu": "false", - "enable_tpu": "false", - "enable_internet": "true", - "dataset_sources": [], - "competition_sources": [], - "kernel_sources": ["marcogorelli/tpc-h-data-parquet-s-2"], - "model_sources": [] -} \ No newline at end of file diff --git a/tpch/notebooks/q18/execute.ipynb b/tpch/notebooks/q18/execute.ipynb deleted file mode 100644 index edf635d9e..000000000 --- a/tpch/notebooks/q18/execute.ipynb +++ /dev/null @@ -1,210 +0,0 @@ -{ - "cells": [ - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "!pip uninstall apache-beam -y && pip install -U pandas polars pyarrow narwhals" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "import pandas as pd\n", - "import polars as pl\n", - "\n", - "pd.options.mode.copy_on_write = True\n", - "pd.options.future.infer_string = True" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "from typing import Any\n", - "\n", - "import narwhals as nw\n", - "\n", - "\n", - "def q18(customer_ds_raw: Any, lineitem_ds_raw: Any, orders_ds_raw: Any) -> Any:\n", - " customer_ds = nw.from_native(customer_ds_raw)\n", - " lineitem_ds = nw.from_native(lineitem_ds_raw)\n", - " orders_ds = nw.from_native(orders_ds_raw)\n", - "\n", - " var1 = 300\n", - "\n", - " query1 = (\n", - " lineitem_ds.group_by(\"l_orderkey\")\n", - " .agg(nw.col(\"l_quantity\").sum().alias(\"sum_quantity\"))\n", - " .filter(nw.col(\"sum_quantity\") > var1)\n", - " )\n", - "\n", - " q_final = (\n", - " orders_ds.join(query1, left_on=\"o_orderkey\", right_on=\"l_orderkey\", how=\"semi\")\n", - " .join(lineitem_ds, left_on=\"o_orderkey\", right_on=\"l_orderkey\")\n", - " .join(customer_ds, left_on=\"o_custkey\", right_on=\"c_custkey\")\n", - " .group_by(\"c_name\", \"o_custkey\", \"o_orderkey\", \"o_orderdate\", \"o_totalprice\")\n", - " .agg(nw.col(\"l_quantity\").sum().alias(\"col6\"))\n", - " .select(\n", - " nw.col(\"c_name\"),\n", - " nw.col(\"o_custkey\").alias(\"c_custkey\"),\n", - " nw.col(\"o_orderkey\"),\n", - " nw.col(\"o_orderdate\").alias(\"o_orderdat\"),\n", - " nw.col(\"o_totalprice\"),\n", - " nw.col(\"col6\"),\n", - " )\n", - " .sort(by=[\"o_totalprice\", \"o_orderdat\"], descending=[True, False])\n", - " .head(100)\n", - " )\n", - "\n", - " return nw.to_native(q_final)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "dir_ = 
\"/kaggle/input/tpc-h-data-parquet-s-2/\"\n", - "customer = dir_ + \"customer.parquet\"\n", - "lineitem = dir_ + \"lineitem.parquet\"\n", - "orders = dir_ + \"orders.parquet\"" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "IO_FUNCS = {\n", - " \"pandas\": lambda x: pd.read_parquet(x, engine=\"pyarrow\"),\n", - " \"pandas[pyarrow]\": lambda x: pd.read_parquet(\n", - " x, engine=\"pyarrow\", dtype_backend=\"pyarrow\"\n", - " ),\n", - " \"polars[eager]\": lambda x: pl.read_parquet(x),\n", - " \"polars[lazy]\": lambda x: pl.scan_parquet(x),\n", - "}" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "results = {}" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## pandas via Narwhals" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "tool = \"pandas\"\n", - "fn = IO_FUNCS[tool]\n", - "timings = %timeit -o -q q19(fn(lineitem), fn(part))\n", - "results[tool] = timings.all_runs" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## pandas, pyarrow dtypes, via Narwhals" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "tool = \"pandas[pyarrow]\"\n", - "fn = IO_FUNCS[tool]\n", - "timings = %timeit -o -q q18(fn(customer), fn(lineitem), fn(orders))\n", - "results[tool] = timings.all_runs" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Polars read_parquet" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "tool = \"polars[eager]\"\n", - "fn = IO_FUNCS[tool]\n", - "timings = %timeit -o -q q18(fn(customer), fn(lineitem), fn(orders))\n", - "results[tool] = timings.all_runs" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Polars scan_parquet" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "tool = \"polars[lazy]\"\n", - "fn = IO_FUNCS[tool]\n", - "timings = %timeit -o -q q18(fn(customer), fn(lineitem), fn(orders)).collect()\n", - "results[tool] = timings.all_runs" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Save" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "import json\n", - "\n", - "with open(\"results.json\", \"w\") as fd:\n", - " json.dump(results, fd)" - ] - } - ], - "metadata": { - "language_info": { - "name": "python" - } - }, - "nbformat": 4, - "nbformat_minor": 2 -} diff --git a/tpch/notebooks/q18/kernel-metadata.json b/tpch/notebooks/q18/kernel-metadata.json deleted file mode 100644 index e1c11e53c..000000000 --- a/tpch/notebooks/q18/kernel-metadata.json +++ /dev/null @@ -1,15 +0,0 @@ -{ - "id": "marcogorelli/narwhals-tpch-q18-s2", - "title": "Narwhals TPCH Q18 S2", - "code_file": "execute.ipynb", - "language": "python", - "kernel_type": "notebook", - "is_private": "false", - "enable_gpu": "false", - "enable_tpu": "false", - "enable_internet": "true", - "dataset_sources": [], - "competition_sources": [], - "kernel_sources": ["marcogorelli/tpc-h-data-parquet-s-2"], - "model_sources": [] -} \ No newline at end of file diff --git a/tpch/notebooks/q19/execute.ipynb b/tpch/notebooks/q19/execute.ipynb deleted file mode 100644 index 8860cc773..000000000 --- a/tpch/notebooks/q19/execute.ipynb +++ /dev/null 
@@ -1,373 +0,0 @@ -{ - "cells": [ - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "papermill": { - "duration": 33.390992, - "end_time": "2024-03-22T17:24:15.601719", - "exception": false, - "start_time": "2024-03-22T17:23:42.210727", - "status": "completed" - }, - "tags": [] - }, - "outputs": [], - "source": [ - "!pip uninstall apache-beam -y && pip install -U pandas polars pyarrow narwhals" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "papermill": { - "duration": 0.907754, - "end_time": "2024-03-22T17:24:39.053873", - "exception": false, - "start_time": "2024-03-22T17:24:38.146119", - "status": "completed" - }, - "tags": [] - }, - "outputs": [], - "source": [ - "import pandas as pd\n", - "import polars as pl\n", - "\n", - "pd.options.mode.copy_on_write = True\n", - "pd.options.future.infer_string = True" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "papermill": { - "duration": 0.021725, - "end_time": "2024-03-22T17:24:39.080999", - "exception": false, - "start_time": "2024-03-22T17:24:39.059274", - "status": "completed" - }, - "tags": [] - }, - "outputs": [], - "source": [ - "from typing import Any\n", - "\n", - "import narwhals as nw\n", - "\n", - "\n", - "def q19(lineitem_ds_raw: Any, part_ds_raw: Any) -> Any:\n", - " lineitem_ds = nw.from_native(lineitem_ds_raw)\n", - " part_ds = nw.from_native(part_ds_raw)\n", - "\n", - " result = (\n", - " part_ds.join(lineitem_ds, left_on=\"p_partkey\", right_on=\"l_partkey\")\n", - " .filter(nw.col(\"l_shipmode\").is_in([\"AIR\", \"AIR REG\"]))\n", - " .filter(nw.col(\"l_shipinstruct\") == \"DELIVER IN PERSON\")\n", - " .filter(\n", - " (\n", - " (nw.col(\"p_brand\") == \"Brand#12\")\n", - " & nw.col(\"p_container\").is_in([\"SM CASE\", \"SM BOX\", \"SM PACK\", \"SM PKG\"])\n", - " & (nw.col(\"l_quantity\").is_between(1, 11))\n", - " & (nw.col(\"p_size\").is_between(1, 5))\n", - " )\n", - " | (\n", - " (nw.col(\"p_brand\") == \"Brand#23\")\n", - " & nw.col(\"p_container\").is_in(\n", - " [\"MED BAG\", \"MED BOX\", \"MED PKG\", \"MED PACK\"]\n", - " )\n", - " & (nw.col(\"l_quantity\").is_between(10, 20))\n", - " & (nw.col(\"p_size\").is_between(1, 10))\n", - " )\n", - " | (\n", - " (nw.col(\"p_brand\") == \"Brand#34\")\n", - " & nw.col(\"p_container\").is_in([\"LG CASE\", \"LG BOX\", \"LG PACK\", \"LG PKG\"])\n", - " & (nw.col(\"l_quantity\").is_between(20, 30))\n", - " & (nw.col(\"p_size\").is_between(1, 15))\n", - " )\n", - " )\n", - " .select(\n", - " (nw.col(\"l_extendedprice\") * (1 - nw.col(\"l_discount\")))\n", - " .sum()\n", - " .round(2)\n", - " .alias(\"revenue\")\n", - " )\n", - " )\n", - "\n", - " return nw.to_native(result)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "papermill": { - "duration": 0.013325, - "end_time": "2024-03-22T17:24:39.099766", - "exception": false, - "start_time": "2024-03-22T17:24:39.086441", - "status": "completed" - }, - "tags": [] - }, - "outputs": [], - "source": [ - "dir_ = \"/kaggle/input/tpc-h-data-parquet-s-2/\"\n", - "lineitem = dir_ + \"lineitem.parquet\"\n", - "part = dir_ + \"part.parquet\"" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "papermill": { - "duration": 0.014284, - "end_time": "2024-03-22T17:24:39.119737", - "exception": false, - "start_time": "2024-03-22T17:24:39.105453", - "status": "completed" - }, - "tags": [] - }, - "outputs": [], - "source": [ - "IO_FUNCS = {\n", - " \"pandas\": lambda x: pd.read_parquet(x, 
engine=\"pyarrow\"),\n", - " \"pandas[pyarrow]\": lambda x: pd.read_parquet(\n", - " x, engine=\"pyarrow\", dtype_backend=\"pyarrow\"\n", - " ),\n", - " \"polars[eager]\": lambda x: pl.read_parquet(x),\n", - " \"polars[lazy]\": lambda x: pl.scan_parquet(x),\n", - "}" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "results = {}" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "papermill": { - "duration": 0.005113, - "end_time": "2024-03-22T17:24:39.130472", - "exception": false, - "start_time": "2024-03-22T17:24:39.125359", - "status": "completed" - }, - "tags": [] - }, - "source": [ - "## pandas via Narwhals" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "papermill": { - "duration": 196.786925, - "end_time": "2024-03-22T17:27:55.922832", - "exception": false, - "start_time": "2024-03-22T17:24:39.135907", - "status": "completed" - }, - "tags": [] - }, - "outputs": [], - "source": [ - "tool = \"pandas\"\n", - "fn = IO_FUNCS[tool]\n", - "timings = %timeit -o -q q19(fn(lineitem), fn(part))\n", - "results[tool] = timings.all_runs" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "papermill": { - "duration": 0.005184, - "end_time": "2024-03-22T17:27:55.933407", - "exception": false, - "start_time": "2024-03-22T17:27:55.928223", - "status": "completed" - }, - "tags": [] - }, - "source": [ - "## pandas, pyarrow dtypes, via Narwhals" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "papermill": { - "duration": 158.748353, - "end_time": "2024-03-22T17:30:34.688289", - "exception": false, - "start_time": "2024-03-22T17:27:55.939936", - "status": "completed" - }, - "tags": [] - }, - "outputs": [], - "source": [ - "tool = \"pandas[pyarrow]\"\n", - "fn = IO_FUNCS[tool]\n", - "timings = %timeit -o -q q19(fn(lineitem), fn(part))\n", - "results[tool] = timings.all_runs" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "papermill": { - "duration": 0.005773, - "end_time": "2024-03-22T17:30:34.7003", - "exception": false, - "start_time": "2024-03-22T17:30:34.694527", - "status": "completed" - }, - "tags": [] - }, - "source": [ - "## Polars read_parquet" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "papermill": { - "duration": 37.821116, - "end_time": "2024-03-22T17:31:12.527466", - "exception": false, - "start_time": "2024-03-22T17:30:34.70635", - "status": "completed" - }, - "tags": [] - }, - "outputs": [], - "source": [ - "tool = \"polars[eager]\"\n", - "fn = IO_FUNCS[tool]\n", - "timings = %timeit -o -q q19(fn(lineitem), fn(part))\n", - "results[tool] = timings.all_runs" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "papermill": { - "duration": 0.005515, - "end_time": "2024-03-22T17:31:12.539068", - "exception": false, - "start_time": "2024-03-22T17:31:12.533553", - "status": "completed" - }, - "tags": [] - }, - "source": [ - "## Polars scan_parquet" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "papermill": { - "duration": 4.800698, - "end_time": "2024-03-22T17:31:17.346813", - "exception": false, - "start_time": "2024-03-22T17:31:12.546115", - "status": "completed" - }, - "tags": [] - }, - "outputs": [], - "source": [ - "tool = \"polars[lazy]\"\n", - "fn = IO_FUNCS[tool]\n", - "timings = %timeit -o -q q19(fn(lineitem), fn(part)).collect()\n", - "results[tool] = timings.all_runs" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Save" - 
] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "import json\n", - "\n", - "with open(\"results.json\", \"w\") as fd:\n", - " json.dump(results, fd)" - ] - } - ], - "metadata": { - "kaggle": { - "accelerator": "none", - "dataSources": [ - { - "sourceId": 167796934, - "sourceType": "kernelVersion" - } - ], - "dockerImageVersionId": 30673, - "isGpuEnabled": false, - "isInternetEnabled": true, - "language": "python", - "sourceType": "notebook" - }, - "kernelspec": { - "display_name": "Python 3", - "language": "python", - "name": "python3" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.10.13" - }, - "papermill": { - "default_parameters": {}, - "duration": 458.423327, - "end_time": "2024-03-22T17:31:18.077306", - "environment_variables": {}, - "exception": null, - "input_path": "__notebook__.ipynb", - "output_path": "__notebook__.ipynb", - "parameters": {}, - "start_time": "2024-03-22T17:23:39.653979", - "version": "2.5.0" - } - }, - "nbformat": 4, - "nbformat_minor": 4 -} diff --git a/tpch/notebooks/q19/kernel-metadata.json b/tpch/notebooks/q19/kernel-metadata.json deleted file mode 100644 index b250ac0a4..000000000 --- a/tpch/notebooks/q19/kernel-metadata.json +++ /dev/null @@ -1,15 +0,0 @@ -{ - "id": "marcogorelli/narwhals-tpch-q19-s2", - "title": "Narwhals TPCH Q19 S2", - "code_file": "execute.ipynb", - "language": "python", - "kernel_type": "notebook", - "is_private": "false", - "enable_gpu": "false", - "enable_tpu": "false", - "enable_internet": "true", - "dataset_sources": [], - "competition_sources": [], - "kernel_sources": ["marcogorelli/tpc-h-data-parquet-s-2"], - "model_sources": [] -} \ No newline at end of file diff --git a/tpch/notebooks/q2/execute.ipynb b/tpch/notebooks/q2/execute.ipynb deleted file mode 100755 index 74ba50f2a..000000000 --- a/tpch/notebooks/q2/execute.ipynb +++ /dev/null @@ -1,474 +0,0 @@ -{ - "cells": [ - { - "cell_type": "code", - "execution_count": null, - "id": "0", - "metadata": { - "papermill": { - "duration": 33.390992, - "end_time": "2024-03-22T17:24:15.601719", - "exception": false, - "start_time": "2024-03-22T17:23:42.210727", - "status": "completed" - }, - "tags": [] - }, - "outputs": [], - "source": [ - "!pip uninstall apache-beam -y && pip install -U pandas polars pyarrow dask[dataframe]" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "1", - "metadata": {}, - "outputs": [], - "source": [ - "!pip install git+https://github.com/MarcoGorelli/narwhals.git@more-dask-tpch" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "2", - "metadata": { - "papermill": { - "duration": 0.907754, - "end_time": "2024-03-22T17:24:39.053873", - "exception": false, - "start_time": "2024-03-22T17:24:38.146119", - "status": "completed" - }, - "tags": [] - }, - "outputs": [], - "source": [ - "import pandas as pd\n", - "import polars as pl\n", - "\n", - "pd.options.mode.copy_on_write = True\n", - "pd.options.future.infer_string = True" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "3", - "metadata": { - "papermill": { - "duration": 0.021725, - "end_time": "2024-03-22T17:24:39.080999", - "exception": false, - "start_time": "2024-03-22T17:24:39.059274", - "status": "completed" - }, - "tags": [] - }, - "outputs": [], - "source": [ - "from typing 
import Any\n", - "\n", - "import narwhals as nw\n", - "\n", - "\n", - "@nw.narwhalify\n", - "def q2(\n", - " region_ds: Any,\n", - " nation_ds: Any,\n", - " supplier_ds: Any,\n", - " part_ds: Any,\n", - " part_supp_ds: Any,\n", - ") -> Any:\n", - " var_1 = 15\n", - " var_2 = \"BRASS\"\n", - " var_3 = \"EUROPE\"\n", - "\n", - " result_q2 = (\n", - " part_ds.join(part_supp_ds, left_on=\"p_partkey\", right_on=\"ps_partkey\")\n", - " .join(supplier_ds, left_on=\"ps_suppkey\", right_on=\"s_suppkey\")\n", - " .join(nation_ds, left_on=\"s_nationkey\", right_on=\"n_nationkey\")\n", - " .join(region_ds, left_on=\"n_regionkey\", right_on=\"r_regionkey\")\n", - " .filter(\n", - " nw.col(\"p_size\") == var_1,\n", - " nw.col(\"p_type\").str.ends_with(var_2),\n", - " nw.col(\"r_name\") == var_3,\n", - " )\n", - " )\n", - "\n", - " final_cols = [\n", - " \"s_acctbal\",\n", - " \"s_name\",\n", - " \"n_name\",\n", - " \"p_partkey\",\n", - " \"p_mfgr\",\n", - " \"s_address\",\n", - " \"s_phone\",\n", - " \"s_comment\",\n", - " ]\n", - "\n", - " return (\n", - " result_q2.group_by(\"p_partkey\")\n", - " .agg(nw.col(\"ps_supplycost\").min().alias(\"ps_supplycost\"))\n", - " .join(\n", - " result_q2,\n", - " left_on=[\"p_partkey\", \"ps_supplycost\"],\n", - " right_on=[\"p_partkey\", \"ps_supplycost\"],\n", - " )\n", - " .select(final_cols)\n", - " .sort(\n", - " [\"s_acctbal\", \"n_name\", \"s_name\", \"p_partkey\"],\n", - " descending=[True, False, False, False],\n", - " )\n", - " .head(100)\n", - " )" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "4", - "metadata": { - "papermill": { - "duration": 0.013325, - "end_time": "2024-03-22T17:24:39.099766", - "exception": false, - "start_time": "2024-03-22T17:24:39.086441", - "status": "completed" - }, - "tags": [] - }, - "outputs": [], - "source": [ - "dir_ = \"/kaggle/input/tpc-h-data-parquet-s-2/\"\n", - "region = dir_ + \"region.parquet\"\n", - "nation = dir_ + \"nation.parquet\"\n", - "customer = dir_ + \"customer.parquet\"\n", - "lineitem = dir_ + \"lineitem.parquet\"\n", - "orders = dir_ + \"orders.parquet\"\n", - "supplier = dir_ + \"supplier.parquet\"\n", - "part = dir_ + \"part.parquet\"\n", - "partsupp = dir_ + \"partsupp.parquet\"" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "5", - "metadata": { - "papermill": { - "duration": 0.014284, - "end_time": "2024-03-22T17:24:39.119737", - "exception": false, - "start_time": "2024-03-22T17:24:39.105453", - "status": "completed" - }, - "tags": [] - }, - "outputs": [], - "source": [ - "import dask.dataframe as dd\n", - "import pyarrow.parquet as pq\n", - "\n", - "IO_FUNCS = {\n", - " \"pandas\": lambda x: pd.read_parquet(x, engine=\"pyarrow\"),\n", - " \"pandas[pyarrow]\": lambda x: pd.read_parquet(\n", - " x, engine=\"pyarrow\", dtype_backend=\"pyarrow\"\n", - " ),\n", - " \"polars[eager]\": lambda x: pl.read_parquet(x),\n", - " \"polars[lazy]\": lambda x: pl.scan_parquet(x),\n", - " \"pyarrow\": lambda x: pq.read_table(x),\n", - " \"dask\": lambda x: dd.read_parquet(x, engine=\"pyarrow\", dtype_backend=\"pyarrow\"),\n", - "}" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "6", - "metadata": {}, - "outputs": [], - "source": [ - "results = {}" - ] - }, - { - "cell_type": "markdown", - "id": "7", - "metadata": { - "papermill": { - "duration": 0.005113, - "end_time": "2024-03-22T17:24:39.130472", - "exception": false, - "start_time": "2024-03-22T17:24:39.125359", - "status": "completed" - }, - "tags": [] - }, - "source": [ - "## pandas" - ] 
- }, - { - "cell_type": "code", - "execution_count": null, - "id": "8", - "metadata": { - "papermill": { - "duration": 196.786925, - "end_time": "2024-03-22T17:27:55.922832", - "exception": false, - "start_time": "2024-03-22T17:24:39.135907", - "status": "completed" - }, - "tags": [] - }, - "outputs": [], - "source": [ - "tool = \"pandas\"\n", - "fn = IO_FUNCS[tool]\n", - "timings = %timeit -o -q q2(fn(region), fn(nation), fn(supplier), fn(part), fn(partsupp))\n", - "results[tool] = timings.all_runs" - ] - }, - { - "cell_type": "markdown", - "id": "9", - "metadata": { - "papermill": { - "duration": 0.005184, - "end_time": "2024-03-22T17:27:55.933407", - "exception": false, - "start_time": "2024-03-22T17:27:55.928223", - "status": "completed" - }, - "tags": [] - }, - "source": [ - "## pandas, pyarrow dtypes" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "10", - "metadata": { - "papermill": { - "duration": 158.748353, - "end_time": "2024-03-22T17:30:34.688289", - "exception": false, - "start_time": "2024-03-22T17:27:55.939936", - "status": "completed" - }, - "tags": [] - }, - "outputs": [], - "source": [ - "tool = \"pandas[pyarrow]\"\n", - "fn = IO_FUNCS[tool]\n", - "timings = %timeit -o -q q2(fn(region), fn(nation), fn(supplier), fn(part), fn(partsupp))\n", - "results[tool] = timings.all_runs" - ] - }, - { - "cell_type": "markdown", - "id": "11", - "metadata": { - "papermill": { - "duration": 0.005773, - "end_time": "2024-03-22T17:30:34.700300", - "exception": false, - "start_time": "2024-03-22T17:30:34.694527", - "status": "completed" - }, - "tags": [] - }, - "source": [ - "## Polars read_parquet" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "12", - "metadata": { - "papermill": { - "duration": 37.821116, - "end_time": "2024-03-22T17:31:12.527466", - "exception": false, - "start_time": "2024-03-22T17:30:34.706350", - "status": "completed" - }, - "tags": [] - }, - "outputs": [], - "source": [ - "tool = \"polars[eager]\"\n", - "fn = IO_FUNCS[tool]\n", - "timings = %timeit -o -q q2(fn(region), fn(nation), fn(supplier), fn(part), fn(partsupp))\n", - "results[tool] = timings.all_runs" - ] - }, - { - "cell_type": "markdown", - "id": "13", - "metadata": { - "papermill": { - "duration": 0.005515, - "end_time": "2024-03-22T17:31:12.539068", - "exception": false, - "start_time": "2024-03-22T17:31:12.533553", - "status": "completed" - }, - "tags": [] - }, - "source": [ - "## Polars scan_parquet" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "14", - "metadata": { - "papermill": { - "duration": 4.800698, - "end_time": "2024-03-22T17:31:17.346813", - "exception": false, - "start_time": "2024-03-22T17:31:12.546115", - "status": "completed" - }, - "tags": [] - }, - "outputs": [], - "source": [ - "tool = \"polars[lazy]\"\n", - "fn = IO_FUNCS[tool]\n", - "timings = %timeit -o -q q2(fn(region), fn(nation), fn(supplier), fn(part), fn(partsupp)).collect()\n", - "results[tool] = timings.all_runs" - ] - }, - { - "cell_type": "markdown", - "id": "15", - "metadata": {}, - "source": [ - "## PyArrow" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "16", - "metadata": {}, - "outputs": [], - "source": [ - "tool = \"pyarrow\"\n", - "fn = IO_FUNCS[tool]\n", - "timings = %timeit -o -q q2(fn(region), fn(nation), fn(supplier), fn(part), fn(partsupp))\n", - "results[tool] = timings.all_runs" - ] - }, - { - "cell_type": "markdown", - "id": "17", - "metadata": {}, - "source": [ - "## Dask" - ] - }, - { - "cell_type": "code", - 
"execution_count": null, - "id": "18", - "metadata": {}, - "outputs": [], - "source": [ - "tool = \"dask\"\n", - "fn = IO_FUNCS[tool]\n", - "timings = %timeit -o -q q2(fn(region), fn(nation), fn(supplier), fn(part), fn(partsupp)).compute()\n", - "results[tool] = timings.all_runs" - ] - }, - { - "cell_type": "markdown", - "id": "19", - "metadata": {}, - "source": [ - "## Save" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "20", - "metadata": {}, - "outputs": [], - "source": [ - "import json\n", - "\n", - "with open(\"results.json\", \"w\") as fd:\n", - " json.dump(results, fd)" - ] - } - ], - "metadata": { - "kaggle": { - "accelerator": "none", - "dataSources": [ - { - "sourceId": 167796716, - "sourceType": "kernelVersion" - }, - { - "sourceId": 167796934, - "sourceType": "kernelVersion" - }, - { - "sourceId": 167796952, - "sourceType": "kernelVersion" - }, - { - "sourceId": 167796969, - "sourceType": "kernelVersion" - } - ], - "isGpuEnabled": false, - "isInternetEnabled": true, - "language": "python", - "sourceType": "notebook" - }, - "kernelspec": { - "display_name": "Python 3", - "language": "python", - "name": "python3" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.11.9" - }, - "papermill": { - "default_parameters": {}, - "duration": 458.423327, - "end_time": "2024-03-22T17:31:18.077306", - "environment_variables": {}, - "exception": null, - "input_path": "__notebook__.ipynb", - "output_path": "__notebook__.ipynb", - "parameters": {}, - "start_time": "2024-03-22T17:23:39.653979", - "version": "2.5.0" - } - }, - "nbformat": 4, - "nbformat_minor": 5 -} diff --git a/tpch/notebooks/q2/kernel-metadata.json b/tpch/notebooks/q2/kernel-metadata.json deleted file mode 100644 index 96386cbff..000000000 --- a/tpch/notebooks/q2/kernel-metadata.json +++ /dev/null @@ -1,15 +0,0 @@ -{ - "id": "marcogorelli/narwhals-tpch-q2-s2", - "title": "Narwhals TPCH Q2 S2", - "code_file": "execute.ipynb", - "language": "python", - "kernel_type": "notebook", - "is_private": "false", - "enable_gpu": "false", - "enable_tpu": "false", - "enable_internet": "true", - "dataset_sources": [], - "competition_sources": [], - "kernel_sources": ["marcogorelli/tpc-h-data-parquet-s-2"], - "model_sources": [] -} \ No newline at end of file diff --git a/tpch/notebooks/q20/execute.ipynb b/tpch/notebooks/q20/execute.ipynb deleted file mode 100644 index a9698c1ad..000000000 --- a/tpch/notebooks/q20/execute.ipynb +++ /dev/null @@ -1,382 +0,0 @@ -{ - "cells": [ - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "papermill": { - "duration": 33.390992, - "end_time": "2024-03-22T17:24:15.601719", - "exception": false, - "start_time": "2024-03-22T17:23:42.210727", - "status": "completed" - }, - "tags": [] - }, - "outputs": [], - "source": [ - "!pip uninstall apache-beam -y && pip install -U pandas polars pyarrow narwhals" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "papermill": { - "duration": 0.907754, - "end_time": "2024-03-22T17:24:39.053873", - "exception": false, - "start_time": "2024-03-22T17:24:38.146119", - "status": "completed" - }, - "tags": [] - }, - "outputs": [], - "source": [ - "import pandas as pd\n", - "import polars as pl\n", - "\n", - "pd.options.mode.copy_on_write = True\n", - "pd.options.future.infer_string = True" - ] - }, - { - 
"cell_type": "code", - "execution_count": null, - "metadata": { - "papermill": { - "duration": 0.021725, - "end_time": "2024-03-22T17:24:39.080999", - "exception": false, - "start_time": "2024-03-22T17:24:39.059274", - "status": "completed" - }, - "tags": [] - }, - "outputs": [], - "source": [ - "from datetime import datetime\n", - "from typing import Any\n", - "\n", - "import narwhals as nw\n", - "\n", - "\n", - "def q20(\n", - " part_ds_raw: Any,\n", - " partsupp_ds_raw: Any,\n", - " nation_ds_raw: Any,\n", - " lineitem_ds_raw: Any,\n", - " supplier_ds_raw: Any,\n", - ") -> Any:\n", - " part_ds = nw.from_native(part_ds_raw)\n", - " nation_ds = nw.from_native(nation_ds_raw)\n", - " partsupp_ds = nw.from_native(partsupp_ds_raw)\n", - " lineitem_ds = nw.from_native(lineitem_ds_raw)\n", - " supplier_ds = nw.from_native(supplier_ds_raw)\n", - "\n", - " var1 = datetime(1994, 1, 1)\n", - " var2 = datetime(1995, 1, 1)\n", - " var3 = \"CANADA\"\n", - " var4 = \"forest\"\n", - "\n", - " query1 = (\n", - " lineitem_ds.filter(nw.col(\"l_shipdate\").is_between(var1, var2, closed=\"left\"))\n", - " .group_by(\"l_partkey\", \"l_suppkey\")\n", - " .agg((nw.col(\"l_quantity\").sum()).alias(\"sum_quantity\"))\n", - " .with_columns(sum_quantity=nw.col(\"sum_quantity\") * 0.5)\n", - " )\n", - " query2 = nation_ds.filter(nw.col(\"n_name\") == var3)\n", - " query3 = supplier_ds.join(query2, left_on=\"s_nationkey\", right_on=\"n_nationkey\")\n", - "\n", - " result = (\n", - " part_ds.filter(nw.col(\"p_name\").str.starts_with(var4))\n", - " .select(nw.col(\"p_partkey\").unique())\n", - " .join(partsupp_ds, left_on=\"p_partkey\", right_on=\"ps_partkey\")\n", - " .join(\n", - " query1,\n", - " left_on=[\"ps_suppkey\", \"p_partkey\"],\n", - " right_on=[\"l_suppkey\", \"l_partkey\"],\n", - " )\n", - " .filter(nw.col(\"ps_availqty\") > nw.col(\"sum_quantity\"))\n", - " .select(nw.col(\"ps_suppkey\").unique())\n", - " .join(query3, left_on=\"ps_suppkey\", right_on=\"s_suppkey\")\n", - " .select(\"s_name\", \"s_address\")\n", - " .sort(\"s_name\")\n", - " )\n", - "\n", - " return nw.to_native(result)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "papermill": { - "duration": 0.013325, - "end_time": "2024-03-22T17:24:39.099766", - "exception": false, - "start_time": "2024-03-22T17:24:39.086441", - "status": "completed" - }, - "tags": [] - }, - "outputs": [], - "source": [ - "dir_ = \"/kaggle/input/tpc-h-data-parquet-s-2/\"\n", - "nation = dir_ + \"nation.parquet\"\n", - "lineitem = dir_ + \"lineitem.parquet\"\n", - "supplier = dir_ + \"supplier.parquet\"\n", - "part = dir_ + \"part.parquet\"\n", - "partsupp = dir_ + \"partsupp.parquet\"" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "papermill": { - "duration": 0.014284, - "end_time": "2024-03-22T17:24:39.119737", - "exception": false, - "start_time": "2024-03-22T17:24:39.105453", - "status": "completed" - }, - "tags": [] - }, - "outputs": [], - "source": [ - "IO_FUNCS = {\n", - " \"pandas\": lambda x: pd.read_parquet(x, engine=\"pyarrow\"),\n", - " \"pandas[pyarrow]\": lambda x: pd.read_parquet(\n", - " x, engine=\"pyarrow\", dtype_backend=\"pyarrow\"\n", - " ),\n", - " \"polars[eager]\": lambda x: pl.read_parquet(x),\n", - " \"polars[lazy]\": lambda x: pl.scan_parquet(x),\n", - "}" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "results = {}" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "papermill": { - "duration": 
0.005113, - "end_time": "2024-03-22T17:24:39.130472", - "exception": false, - "start_time": "2024-03-22T17:24:39.125359", - "status": "completed" - }, - "tags": [] - }, - "source": [ - "## pandas via Narwhals" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "papermill": { - "duration": 196.786925, - "end_time": "2024-03-22T17:27:55.922832", - "exception": false, - "start_time": "2024-03-22T17:24:39.135907", - "status": "completed" - }, - "tags": [] - }, - "outputs": [], - "source": [ - "tool = \"pandas\"\n", - "fn = IO_FUNCS[tool]\n", - "timings = %timeit -o -q q20(fn(part), fn(partsupp), fn(nation), fn(lineitem), fn(supplier))\n", - "results[tool] = timings.all_runs" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "papermill": { - "duration": 0.005184, - "end_time": "2024-03-22T17:27:55.933407", - "exception": false, - "start_time": "2024-03-22T17:27:55.928223", - "status": "completed" - }, - "tags": [] - }, - "source": [ - "## pandas, pyarrow dtypes, via Narwhals" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "papermill": { - "duration": 158.748353, - "end_time": "2024-03-22T17:30:34.688289", - "exception": false, - "start_time": "2024-03-22T17:27:55.939936", - "status": "completed" - }, - "tags": [] - }, - "outputs": [], - "source": [ - "tool = \"pandas[pyarrow]\"\n", - "fn = IO_FUNCS[tool]\n", - "timings = %timeit -o -q q20(fn(part), fn(partsupp), fn(nation), fn(lineitem), fn(supplier))\n", - "results[tool] = timings.all_runs" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "papermill": { - "duration": 0.005773, - "end_time": "2024-03-22T17:30:34.7003", - "exception": false, - "start_time": "2024-03-22T17:30:34.694527", - "status": "completed" - }, - "tags": [] - }, - "source": [ - "## Polars read_parquet" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "papermill": { - "duration": 37.821116, - "end_time": "2024-03-22T17:31:12.527466", - "exception": false, - "start_time": "2024-03-22T17:30:34.70635", - "status": "completed" - }, - "tags": [] - }, - "outputs": [], - "source": [ - "tool = \"polars[eager]\"\n", - "fn = IO_FUNCS[tool]\n", - "timings = %timeit -o -q q20(fn(part), fn(partsupp), fn(nation), fn(lineitem), fn(supplier))\n", - "results[tool] = timings.all_runs" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "papermill": { - "duration": 0.005515, - "end_time": "2024-03-22T17:31:12.539068", - "exception": false, - "start_time": "2024-03-22T17:31:12.533553", - "status": "completed" - }, - "tags": [] - }, - "source": [ - "## Polars scan_parquet" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "papermill": { - "duration": 4.800698, - "end_time": "2024-03-22T17:31:17.346813", - "exception": false, - "start_time": "2024-03-22T17:31:12.546115", - "status": "completed" - }, - "tags": [] - }, - "outputs": [], - "source": [ - "tool = \"polars[lazy]\"\n", - "fn = IO_FUNCS[tool]\n", - "timings = %timeit -o -q q20(fn(part), fn(partsupp), fn(nation), fn(lineitem), fn(supplier)).collect()\n", - "results[tool] = timings.all_runs" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Save" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "import json\n", - "\n", - "with open(\"results.json\", \"w\") as fd:\n", - " json.dump(results, fd)" - ] - } - ], - "metadata": { - "kaggle": { - "accelerator": "none", - "dataSources": [ - { - "sourceId": 
167796934, - "sourceType": "kernelVersion" - } - ], - "dockerImageVersionId": 30673, - "isGpuEnabled": false, - "isInternetEnabled": true, - "language": "python", - "sourceType": "notebook" - }, - "kernelspec": { - "display_name": "Python 3", - "language": "python", - "name": "python3" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.10.13" - }, - "papermill": { - "default_parameters": {}, - "duration": 458.423327, - "end_time": "2024-03-22T17:31:18.077306", - "environment_variables": {}, - "exception": null, - "input_path": "__notebook__.ipynb", - "output_path": "__notebook__.ipynb", - "parameters": {}, - "start_time": "2024-03-22T17:23:39.653979", - "version": "2.5.0" - } - }, - "nbformat": 4, - "nbformat_minor": 4 -} diff --git a/tpch/notebooks/q20/kernel-metadata.json b/tpch/notebooks/q20/kernel-metadata.json deleted file mode 100644 index e6733375e..000000000 --- a/tpch/notebooks/q20/kernel-metadata.json +++ /dev/null @@ -1,15 +0,0 @@ -{ - "id": "marcogorelli/narwhals-tpch-q20-s2", - "title": "Narwhals TPCH Q20 S2", - "code_file": "execute.ipynb", - "language": "python", - "kernel_type": "notebook", - "is_private": "false", - "enable_gpu": "false", - "enable_tpu": "false", - "enable_internet": "true", - "dataset_sources": [], - "competition_sources": [], - "kernel_sources": ["marcogorelli/tpc-h-data-parquet-s-2"], - "model_sources": [] -} \ No newline at end of file diff --git a/tpch/notebooks/q21/execute.ipynb b/tpch/notebooks/q21/execute.ipynb deleted file mode 100755 index af12a424c..000000000 --- a/tpch/notebooks/q21/execute.ipynb +++ /dev/null @@ -1,450 +0,0 @@ -{ - "cells": [ - { - "cell_type": "code", - "execution_count": null, - "id": "0", - "metadata": { - "papermill": { - "duration": 56.056574, - "end_time": "2024-06-20T09:39:32.181310", - "exception": false, - "start_time": "2024-06-20T09:38:36.124736", - "status": "completed" - }, - "tags": [] - }, - "outputs": [], - "source": [ - "!pip uninstall apache-beam -y && pip install -U pandas polars pyarrow narwhals" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "1", - "metadata": { - "papermill": { - "duration": 0.710694, - "end_time": "2024-06-20T09:39:32.905246", - "exception": false, - "start_time": "2024-06-20T09:39:32.194552", - "status": "completed" - }, - "tags": [] - }, - "outputs": [], - "source": [ - "from typing import Any\n", - "\n", - "import pandas as pd\n", - "import polars as pl\n", - "\n", - "import narwhals as nw\n", - "\n", - "pd.options.mode.copy_on_write = True\n", - "pd.options.future.infer_string = True" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "2", - "metadata": { - "papermill": { - "duration": 0.024919, - "end_time": "2024-06-20T09:39:32.942659", - "exception": false, - "start_time": "2024-06-20T09:39:32.917740", - "status": "completed" - }, - "tags": [] - }, - "outputs": [], - "source": [ - "Q_NUM = 21\n", - "\n", - "IO_FUNCS = {\n", - " \"pandas\": lambda x: pd.read_parquet(x, engine=\"pyarrow\"),\n", - " \"pandas[pyarrow]\": lambda x: pd.read_parquet(\n", - " x, engine=\"pyarrow\", dtype_backend=\"pyarrow\"\n", - " ),\n", - " \"polars[eager]\": lambda x: pl.read_parquet(x),\n", - " \"polars[lazy]\": lambda x: pl.scan_parquet(x),\n", - "}" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "3", - "metadata": { - "papermill": { - 
"duration": 0.034054, - "end_time": "2024-06-20T09:39:32.990409", - "exception": false, - "start_time": "2024-06-20T09:39:32.956355", - "status": "completed" - }, - "tags": [] - }, - "outputs": [], - "source": [ - "def q21(\n", - " lineitem_raw: Any,\n", - " nation_raw: Any,\n", - " orders_raw: Any,\n", - " supplier_raw: Any,\n", - ") -> Any:\n", - " lineitem = nw.from_native(lineitem_raw)\n", - " nation = nw.from_native(nation_raw)\n", - " orders = nw.from_native(orders_raw)\n", - " supplier = nw.from_native(supplier_raw)\n", - "\n", - " var1 = \"SAUDI ARABIA\"\n", - "\n", - " q1 = (\n", - " lineitem.group_by(\"l_orderkey\")\n", - " .agg(nw.len().alias(\"n_supp_by_order\"))\n", - " .filter(nw.col(\"n_supp_by_order\") > 1)\n", - " .join(\n", - " lineitem.filter(nw.col(\"l_receiptdate\") > nw.col(\"l_commitdate\")),\n", - " left_on=\"l_orderkey\",\n", - " right_on=\"l_orderkey\",\n", - " )\n", - " )\n", - "\n", - " q_final = (\n", - " q1.group_by(\"l_orderkey\")\n", - " .agg(nw.len().alias(\"n_supp_by_order\"))\n", - " .join(q1, left_on=\"l_orderkey\", right_on=\"l_orderkey\")\n", - " .join(supplier, left_on=\"l_suppkey\", right_on=\"s_suppkey\")\n", - " .join(nation, left_on=\"s_nationkey\", right_on=\"n_nationkey\")\n", - " .join(orders, left_on=\"l_orderkey\", right_on=\"o_orderkey\")\n", - " .filter(nw.col(\"n_supp_by_order\") == 1)\n", - " .filter(nw.col(\"n_name\") == var1)\n", - " .filter(nw.col(\"o_orderstatus\") == \"F\")\n", - " .group_by(\"s_name\")\n", - " .agg(nw.len().alias(\"numwait\"))\n", - " .sort(by=[\"numwait\", \"s_name\"], descending=[True, False])\n", - " .head(100)\n", - " )\n", - " return nw.to_native(q_final)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "4", - "metadata": { - "papermill": { - "duration": 0.023895, - "end_time": "2024-06-20T09:39:33.027127", - "exception": false, - "start_time": "2024-06-20T09:39:33.003232", - "status": "completed" - }, - "tags": [] - }, - "outputs": [], - "source": [ - "dir_ = \"/kaggle/input/tpc-h-data-parquet-s-2/\"\n", - "\n", - "lineitem = dir_ + \"lineitem.parquet\"\n", - "nation = dir_ + \"nation.parquet\"\n", - "orders = dir_ + \"orders.parquet\"\n", - "supplier = dir_ + \"supplier.parquet\"" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "5", - "metadata": { - "papermill": { - "duration": 0.023307, - "end_time": "2024-06-20T09:39:33.064042", - "exception": false, - "start_time": "2024-06-20T09:39:33.040735", - "status": "completed" - }, - "tags": [] - }, - "outputs": [], - "source": [ - "results = {}" - ] - }, - { - "cell_type": "markdown", - "id": "6", - "metadata": { - "papermill": { - "duration": 0.013315, - "end_time": "2024-06-20T09:39:33.090039", - "exception": false, - "start_time": "2024-06-20T09:39:33.076724", - "status": "completed" - }, - "tags": [] - }, - "source": [ - "## pandas via Narwhals" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "7", - "metadata": { - "papermill": { - "duration": 153.948657, - "end_time": "2024-06-20T09:42:07.051286", - "exception": false, - "start_time": "2024-06-20T09:39:33.102629", - "status": "completed" - }, - "tags": [] - }, - "outputs": [], - "source": [ - "tool = \"pandas\"\n", - "fn = IO_FUNCS[tool]\n", - "\n", - "lineitem_raw, nation_raw, orders_raw, supplier_raw = (\n", - " fn(lineitem),\n", - " fn(nation),\n", - " fn(orders),\n", - " fn(supplier),\n", - ")\n", - "\n", - "timings = %timeit -o -q q21(lineitem_raw, nation_raw, orders_raw, supplier_raw)\n", - "results[tool] = timings.all_runs" - ] - }, 
- { - "cell_type": "markdown", - "id": "8", - "metadata": { - "papermill": { - "duration": 0.012282, - "end_time": "2024-06-20T09:42:07.077095", - "exception": false, - "start_time": "2024-06-20T09:42:07.064813", - "status": "completed" - }, - "tags": [] - }, - "source": [ - "## pandas, pyarrow dtypes, via Narwhals" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "9", - "metadata": { - "papermill": { - "duration": 138.36459, - "end_time": "2024-06-20T09:44:25.454188", - "exception": false, - "start_time": "2024-06-20T09:42:07.089598", - "status": "completed" - }, - "tags": [] - }, - "outputs": [], - "source": [ - "tool = \"pandas[pyarrow]\"\n", - "fn = IO_FUNCS[tool]\n", - "lineitem_raw, nation_raw, orders_raw, supplier_raw = (\n", - " fn(lineitem),\n", - " fn(nation),\n", - " fn(orders),\n", - " fn(supplier),\n", - ")\n", - "\n", - "timings = %timeit -o -q q21(lineitem_raw, nation_raw, orders_raw, supplier_raw)\n", - "results[tool] = timings.all_runs" - ] - }, - { - "cell_type": "markdown", - "id": "10", - "metadata": { - "papermill": { - "duration": 0.012075, - "end_time": "2024-06-20T09:44:25.478725", - "exception": false, - "start_time": "2024-06-20T09:44:25.466650", - "status": "completed" - }, - "tags": [] - }, - "source": [ - "## Polars read_parquet" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "11", - "metadata": { - "papermill": { - "duration": 89.487638, - "end_time": "2024-06-20T09:45:54.978758", - "exception": false, - "start_time": "2024-06-20T09:44:25.491120", - "status": "completed" - }, - "tags": [] - }, - "outputs": [], - "source": [ - "tool = \"polars[eager]\"\n", - "fn = IO_FUNCS[tool]\n", - "\n", - "lineitem_raw, nation_raw, orders_raw, supplier_raw = (\n", - " fn(lineitem),\n", - " fn(nation),\n", - " fn(orders),\n", - " fn(supplier),\n", - ")\n", - "timings = %timeit -o -q q21(lineitem_raw, nation_raw, orders_raw, supplier_raw)\n", - "results[tool] = timings.all_runs" - ] - }, - { - "cell_type": "markdown", - "id": "12", - "metadata": { - "papermill": { - "duration": 0.011745, - "end_time": "2024-06-20T09:45:55.002962", - "exception": false, - "start_time": "2024-06-20T09:45:54.991217", - "status": "completed" - }, - "tags": [] - }, - "source": [ - "## Polars scan_parquet" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "13", - "metadata": { - "papermill": { - "duration": 23.550631, - "end_time": "2024-06-20T09:46:18.566271", - "exception": false, - "start_time": "2024-06-20T09:45:55.015640", - "status": "completed" - }, - "tags": [] - }, - "outputs": [], - "source": [ - "tool = \"polars[lazy]\"\n", - "fn = IO_FUNCS[tool]\n", - "\n", - "lineitem_raw, nation_raw, orders_raw, supplier_raw = (\n", - " fn(lineitem),\n", - " fn(nation),\n", - " fn(orders),\n", - " fn(supplier),\n", - ")\n", - "timings = %timeit -o -q q21(lineitem_raw, nation_raw, orders_raw, supplier_raw).collect()\n", - "results[tool] = timings.all_runs" - ] - }, - { - "cell_type": "markdown", - "id": "14", - "metadata": { - "papermill": { - "duration": 0.012321, - "end_time": "2024-06-20T09:46:18.591255", - "exception": false, - "start_time": "2024-06-20T09:46:18.578934", - "status": "completed" - }, - "tags": [] - }, - "source": [ - "## Save" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "15", - "metadata": { - "papermill": { - "duration": 0.023933, - "end_time": "2024-06-20T09:46:18.627709", - "exception": false, - "start_time": "2024-06-20T09:46:18.603776", - "status": "completed" - }, - "tags": [] - }, - 
"outputs": [], - "source": [ - "import json\n", - "\n", - "with open(\"results.json\", \"w\") as fd:\n", - " json.dump(results, fd)" - ] - } - ], - "metadata": { - "kaggle": { - "accelerator": "none", - "dataSources": [ - { - "sourceId": 167796934, - "sourceType": "kernelVersion" - } - ], - "dockerImageVersionId": 30673, - "isGpuEnabled": false, - "isInternetEnabled": true, - "language": "python", - "sourceType": "notebook" - }, - "kernelspec": { - "display_name": "Python 3", - "language": "python", - "name": "python3" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.10.13" - }, - "papermill": { - "default_parameters": {}, - "duration": 466.953302, - "end_time": "2024-06-20T09:46:19.507794", - "environment_variables": {}, - "exception": null, - "input_path": "__notebook__.ipynb", - "output_path": "__notebook__.ipynb", - "parameters": {}, - "start_time": "2024-06-20T09:38:32.554492", - "version": "2.5.0" - } - }, - "nbformat": 4, - "nbformat_minor": 5 -} diff --git a/tpch/notebooks/q21/kernel-metadata.json b/tpch/notebooks/q21/kernel-metadata.json deleted file mode 100644 index b9e230e46..000000000 --- a/tpch/notebooks/q21/kernel-metadata.json +++ /dev/null @@ -1,15 +0,0 @@ -{ - "id": "marcogorelli/narwhals-tpch-q21-s2", - "title": "Narwhals TPCH Q21 S2", - "code_file": "execute.ipynb", - "language": "python", - "kernel_type": "notebook", - "is_private": "false", - "enable_gpu": "false", - "enable_tpu": "false", - "enable_internet": "true", - "dataset_sources": [], - "competition_sources": [], - "kernel_sources": ["marcogorelli/tpc-h-data-parquet-s-2"], - "model_sources": [] -} \ No newline at end of file diff --git a/tpch/notebooks/q3/execute.ipynb b/tpch/notebooks/q3/execute.ipynb deleted file mode 100755 index b81135fc3..000000000 --- a/tpch/notebooks/q3/execute.ipynb +++ /dev/null @@ -1,566 +0,0 @@ -{ - "cells": [ - { - "cell_type": "code", - "execution_count": null, - "id": "0", - "metadata": { - "papermill": { - "duration": 33.390992, - "end_time": "2024-03-22T17:24:15.601719", - "exception": false, - "start_time": "2024-03-22T17:23:42.210727", - "status": "completed" - }, - "tags": [] - }, - "outputs": [], - "source": [ - "!pip uninstall apache-beam -y && pip install -U pandas polars pyarrow narwhals" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "1", - "metadata": { - "papermill": { - "duration": 0.907754, - "end_time": "2024-03-22T17:24:39.053873", - "exception": false, - "start_time": "2024-03-22T17:24:38.146119", - "status": "completed" - }, - "tags": [] - }, - "outputs": [], - "source": [ - "import pandas as pd\n", - "import polars as pl\n", - "\n", - "pd.options.mode.copy_on_write = True\n", - "pd.options.future.infer_string = True" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "2", - "metadata": {}, - "outputs": [], - "source": [ - "from datetime import date\n", - "from typing import Any\n", - "\n", - "\n", - "def q3_pandas_native(\n", - " customer_ds: Any,\n", - " line_item_ds: Any,\n", - " orders_ds: Any,\n", - ") -> Any:\n", - " var1 = \"BUILDING\"\n", - " var2 = date(1995, 3, 15)\n", - "\n", - " fcustomer = customer_ds[customer_ds[\"c_mktsegment\"] == var1]\n", - "\n", - " jn1 = fcustomer.merge(orders_ds, left_on=\"c_custkey\", right_on=\"o_custkey\")\n", - " jn2 = jn1.merge(line_item_ds, left_on=\"o_orderkey\", 
right_on=\"l_orderkey\")\n", - "\n", - " jn2 = jn2[jn2[\"o_orderdate\"] < var2]\n", - " jn2 = jn2[jn2[\"l_shipdate\"] > var2]\n", - " jn2[\"revenue\"] = jn2.l_extendedprice * (1 - jn2.l_discount)\n", - "\n", - " gb = jn2.groupby([\"o_orderkey\", \"o_orderdate\", \"o_shippriority\"], as_index=False)\n", - " agg = gb[\"revenue\"].sum()\n", - "\n", - " sel = agg.loc[:, [\"o_orderkey\", \"revenue\", \"o_orderdate\", \"o_shippriority\"]]\n", - " sel = sel.rename({\"o_orderkey\": \"l_orderkey\"}, axis=\"columns\")\n", - "\n", - " sorted = sel.sort_values(by=[\"revenue\", \"o_orderdate\"], ascending=[False, True])\n", - "\n", - " return sorted.head(10) # type: ignore[no-any-return]" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "3", - "metadata": { - "papermill": { - "duration": 0.021725, - "end_time": "2024-03-22T17:24:39.080999", - "exception": false, - "start_time": "2024-03-22T17:24:39.059274", - "status": "completed" - }, - "tags": [] - }, - "outputs": [], - "source": [ - "from datetime import datetime\n", - "from typing import Any\n", - "\n", - "import narwhals as nw\n", - "\n", - "\n", - "def q3(\n", - " customer_ds_raw: Any,\n", - " line_item_ds_raw: Any,\n", - " orders_ds_raw: Any,\n", - ") -> Any:\n", - " var_1 = var_2 = datetime(1995, 3, 15)\n", - " var_3 = \"BUILDING\"\n", - "\n", - " customer_ds = nw.from_native(customer_ds_raw)\n", - " line_item_ds = nw.from_native(line_item_ds_raw)\n", - " orders_ds = nw.from_native(orders_ds_raw)\n", - "\n", - " q_final = (\n", - " customer_ds.filter(nw.col(\"c_mktsegment\") == var_3)\n", - " .join(orders_ds, left_on=\"c_custkey\", right_on=\"o_custkey\")\n", - " .join(line_item_ds, left_on=\"o_orderkey\", right_on=\"l_orderkey\")\n", - " .filter(\n", - " nw.col(\"o_orderdate\") < var_2,\n", - " nw.col(\"l_shipdate\") > var_1,\n", - " )\n", - " .with_columns(\n", - " (nw.col(\"l_extendedprice\") * (1 - nw.col(\"l_discount\"))).alias(\"revenue\")\n", - " )\n", - " .group_by([\"o_orderkey\", \"o_orderdate\", \"o_shippriority\"])\n", - " .agg([nw.sum(\"revenue\")])\n", - " .select(\n", - " [\n", - " nw.col(\"o_orderkey\").alias(\"l_orderkey\"),\n", - " \"revenue\",\n", - " \"o_orderdate\",\n", - " \"o_shippriority\",\n", - " ]\n", - " )\n", - " .sort(by=[\"revenue\", \"o_orderdate\"], descending=[True, False])\n", - " .head(10)\n", - " )\n", - "\n", - " return nw.to_native(q_final)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "4", - "metadata": {}, - "outputs": [], - "source": [ - "from typing import Any\n", - "\n", - "import ibis\n", - "\n", - "\n", - "def q3_ibis(\n", - " customer: Any,\n", - " lineitem: Any,\n", - " orders: Any,\n", - " *,\n", - " tool: str,\n", - ") -> Any:\n", - " var1 = \"BUILDING\"\n", - " var2 = date(1995, 3, 15)\n", - "\n", - " q_final = (\n", - " customer.filter(customer[\"c_mktsegment\"] == var1)\n", - " .join(orders, customer[\"c_custkey\"] == orders[\"o_custkey\"])\n", - " .join(lineitem, orders[\"o_orderkey\"] == lineitem[\"l_orderkey\"])\n", - " .filter(ibis._[\"o_orderdate\"] < var2)\n", - " .filter(ibis._[\"l_shipdate\"] > var2)\n", - " .mutate(revenue=(lineitem[\"l_extendedprice\"] * (1 - lineitem[\"l_discount\"])))\n", - " .group_by(\n", - " \"o_orderkey\",\n", - " \"o_orderdate\",\n", - " \"o_shippriority\",\n", - " )\n", - " .agg(revenue=ibis._[\"revenue\"].sum())\n", - " .select(\n", - " ibis._[\"o_orderkey\"].name(\"o_orderkey\"),\n", - " \"revenue\",\n", - " \"o_orderdate\",\n", - " \"o_shippriority\",\n", - " )\n", - " .order_by(ibis.desc(\"revenue\"), 
\"o_orderdate\")\n", - " .limit(10)\n", - " )\n", - " if tool == \"pandas\":\n", - " return q_final.to_pandas()\n", - " if tool == \"polars\":\n", - " return q_final.to_polars()\n", - " raise ValueError(\"expected pandas or polars\")" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "5", - "metadata": { - "papermill": { - "duration": 0.013325, - "end_time": "2024-03-22T17:24:39.099766", - "exception": false, - "start_time": "2024-03-22T17:24:39.086441", - "status": "completed" - }, - "tags": [] - }, - "outputs": [], - "source": [ - "dir_ = \"/kaggle/input/tpc-h-data-parquet-s-2/\"\n", - "region = dir_ + \"region.parquet\"\n", - "nation = dir_ + \"nation.parquet\"\n", - "customer = dir_ + \"customer.parquet\"\n", - "lineitem = dir_ + \"lineitem.parquet\"\n", - "orders = dir_ + \"orders.parquet\"\n", - "supplier = dir_ + \"supplier.parquet\"\n", - "part = dir_ + \"part.parquet\"\n", - "partsupp = dir_ + \"partsupp.parquet\"" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "6", - "metadata": { - "papermill": { - "duration": 0.014284, - "end_time": "2024-03-22T17:24:39.119737", - "exception": false, - "start_time": "2024-03-22T17:24:39.105453", - "status": "completed" - }, - "tags": [] - }, - "outputs": [], - "source": [ - "con_pd = ibis.pandas.connect()\n", - "con_pl = ibis.polars.connect()\n", - "\n", - "IO_FUNCS = {\n", - " \"pandas\": lambda x: pd.read_parquet(x, engine=\"pyarrow\"),\n", - " \"pandas[pyarrow]\": lambda x: pd.read_parquet(\n", - " x, engine=\"pyarrow\", dtype_backend=\"pyarrow\"\n", - " ),\n", - " \"pandas[pyarrow][ibis]\": lambda x: con_pd.read_parquet(\n", - " x, engine=\"pyarrow\", dtype_backend=\"pyarrow\"\n", - " ),\n", - " \"polars[eager]\": lambda x: pl.read_parquet(x),\n", - " \"polars[lazy]\": lambda x: pl.scan_parquet(x),\n", - " \"polars[lazy][ibis]\": lambda x: con_pl.read_parquet(x),\n", - "}" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "7", - "metadata": {}, - "outputs": [], - "source": [ - "results = {}" - ] - }, - { - "cell_type": "markdown", - "id": "8", - "metadata": {}, - "source": [ - "## pandas, pyarrow dtypes, via ibis" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "9", - "metadata": {}, - "outputs": [], - "source": [ - "tool = \"pandas[pyarrow][ibis]\"\n", - "fn = IO_FUNCS[tool]\n", - "timings = %timeit -o -q q3_ibis(fn(customer), fn(lineitem), fn(orders), tool='pandas')\n", - "results[tool] = timings.all_runs" - ] - }, - { - "cell_type": "markdown", - "id": "10", - "metadata": {}, - "source": [ - "## Polars, lazy, via ibis" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "11", - "metadata": {}, - "outputs": [], - "source": [ - "tool = \"polars[lazy][ibis]\"\n", - "fn = IO_FUNCS[tool]\n", - "timings = %timeit -o -q q3_ibis(fn(customer), fn(lineitem), fn(orders), tool='polars')\n", - "results[tool] = timings.all_runs" - ] - }, - { - "cell_type": "markdown", - "id": "12", - "metadata": {}, - "source": [ - "## pandas, pyarrow dtypes, native" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "13", - "metadata": {}, - "outputs": [], - "source": [ - "tool = \"pandas[pyarrow]\"\n", - "fn = IO_FUNCS[tool]\n", - "timings = %timeit -o -q q3_pandas_native(fn(customer), fn(lineitem), fn(orders))\n", - "results[tool + \"[native]\"] = timings.all_runs" - ] - }, - { - "cell_type": "markdown", - "id": "14", - "metadata": { - "papermill": { - "duration": 0.005113, - "end_time": "2024-03-22T17:24:39.130472", - "exception": false, 
- "start_time": "2024-03-22T17:24:39.125359", - "status": "completed" - }, - "tags": [] - }, - "source": [ - "## pandas via Narwhals" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "15", - "metadata": { - "papermill": { - "duration": 196.786925, - "end_time": "2024-03-22T17:27:55.922832", - "exception": false, - "start_time": "2024-03-22T17:24:39.135907", - "status": "completed" - }, - "tags": [] - }, - "outputs": [], - "source": [ - "tool = \"pandas\"\n", - "fn = IO_FUNCS[tool]\n", - "timings = %timeit -o -q q3(fn(customer), fn(lineitem), fn(orders))\n", - "results[tool] = timings.all_runs" - ] - }, - { - "cell_type": "markdown", - "id": "16", - "metadata": { - "papermill": { - "duration": 0.005184, - "end_time": "2024-03-22T17:27:55.933407", - "exception": false, - "start_time": "2024-03-22T17:27:55.928223", - "status": "completed" - }, - "tags": [] - }, - "source": [ - "## pandas, pyarrow dtypes, via Narwhals" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "17", - "metadata": { - "papermill": { - "duration": 158.748353, - "end_time": "2024-03-22T17:30:34.688289", - "exception": false, - "start_time": "2024-03-22T17:27:55.939936", - "status": "completed" - }, - "tags": [] - }, - "outputs": [], - "source": [ - "tool = \"pandas[pyarrow]\"\n", - "fn = IO_FUNCS[tool]\n", - "timings = %timeit -o -q q3(fn(customer), fn(lineitem), fn(orders))\n", - "results[tool] = timings.all_runs" - ] - }, - { - "cell_type": "markdown", - "id": "18", - "metadata": { - "papermill": { - "duration": 0.005773, - "end_time": "2024-03-22T17:30:34.700300", - "exception": false, - "start_time": "2024-03-22T17:30:34.694527", - "status": "completed" - }, - "tags": [] - }, - "source": [ - "## Polars read_parquet" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "19", - "metadata": { - "papermill": { - "duration": 37.821116, - "end_time": "2024-03-22T17:31:12.527466", - "exception": false, - "start_time": "2024-03-22T17:30:34.706350", - "status": "completed" - }, - "tags": [] - }, - "outputs": [], - "source": [ - "tool = \"polars[eager]\"\n", - "fn = IO_FUNCS[tool]\n", - "timings = %timeit -o -q q3(fn(customer), fn(lineitem), fn(orders))\n", - "results[tool] = timings.all_runs" - ] - }, - { - "cell_type": "markdown", - "id": "20", - "metadata": { - "papermill": { - "duration": 0.005515, - "end_time": "2024-03-22T17:31:12.539068", - "exception": false, - "start_time": "2024-03-22T17:31:12.533553", - "status": "completed" - }, - "tags": [] - }, - "source": [ - "## Polars scan_parquet" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "21", - "metadata": { - "papermill": { - "duration": 4.800698, - "end_time": "2024-03-22T17:31:17.346813", - "exception": false, - "start_time": "2024-03-22T17:31:12.546115", - "status": "completed" - }, - "tags": [] - }, - "outputs": [], - "source": [ - "tool = \"polars[lazy]\"\n", - "fn = IO_FUNCS[tool]\n", - "timings = %timeit -o -q q3(fn(customer), fn(lineitem), fn(orders)).collect()\n", - "results[tool] = timings.all_runs" - ] - }, - { - "cell_type": "markdown", - "id": "22", - "metadata": {}, - "source": [ - "## Save" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "23", - "metadata": {}, - "outputs": [], - "source": [ - "import json\n", - "\n", - "with open(\"results.json\", \"w\") as fd:\n", - " json.dump(results, fd)" - ] - } - ], - "metadata": { - "kaggle": { - "accelerator": "none", - "dataSources": [ - { - "sourceId": 167796716, - "sourceType": "kernelVersion" - }, - { 
- "sourceId": 167796934, - "sourceType": "kernelVersion" - }, - { - "sourceId": 167796952, - "sourceType": "kernelVersion" - }, - { - "sourceId": 167796969, - "sourceType": "kernelVersion" - } - ], - "isGpuEnabled": false, - "isInternetEnabled": true, - "language": "python", - "sourceType": "notebook" - }, - "kernelspec": { - "display_name": "Python 3", - "language": "python", - "name": "python3" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.10.13" - }, - "papermill": { - "default_parameters": {}, - "duration": 458.423327, - "end_time": "2024-03-22T17:31:18.077306", - "environment_variables": {}, - "exception": null, - "input_path": "__notebook__.ipynb", - "output_path": "__notebook__.ipynb", - "parameters": {}, - "start_time": "2024-03-22T17:23:39.653979", - "version": "2.5.0" - } - }, - "nbformat": 4, - "nbformat_minor": 5 -} diff --git a/tpch/notebooks/q3/kernel-metadata.json b/tpch/notebooks/q3/kernel-metadata.json deleted file mode 100644 index cd0288106..000000000 --- a/tpch/notebooks/q3/kernel-metadata.json +++ /dev/null @@ -1,15 +0,0 @@ -{ - "id": "marcogorelli/narwhals-tpch-q3-s2", - "title": "Narwhals TPCH Q3 S2", - "code_file": "execute.ipynb", - "language": "python", - "kernel_type": "notebook", - "is_private": "false", - "enable_gpu": "false", - "enable_tpu": "false", - "enable_internet": "true", - "dataset_sources": [], - "competition_sources": [], - "kernel_sources": ["marcogorelli/tpc-h-data-parquet-s-2"], - "model_sources": [] -} \ No newline at end of file diff --git a/tpch/notebooks/q4/execute.ipynb b/tpch/notebooks/q4/execute.ipynb deleted file mode 100755 index b0a55e345..000000000 --- a/tpch/notebooks/q4/execute.ipynb +++ /dev/null @@ -1,507 +0,0 @@ -{ - "cells": [ - { - "cell_type": "code", - "execution_count": null, - "id": "0", - "metadata": { - "papermill": { - "duration": 33.390992, - "end_time": "2024-03-22T17:24:15.601719", - "exception": false, - "start_time": "2024-03-22T17:23:42.210727", - "status": "completed" - }, - "tags": [] - }, - "outputs": [], - "source": [ - "!pip uninstall apache-beam -y && pip install -U pandas polars pyarrow narwhals" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "1", - "metadata": { - "papermill": { - "duration": 0.907754, - "end_time": "2024-03-22T17:24:39.053873", - "exception": false, - "start_time": "2024-03-22T17:24:38.146119", - "status": "completed" - }, - "tags": [] - }, - "outputs": [], - "source": [ - "import pandas as pd\n", - "import polars as pl\n", - "\n", - "pd.options.mode.copy_on_write = True\n", - "pd.options.future.infer_string = True" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "2", - "metadata": {}, - "outputs": [], - "source": [ - "from datetime import date\n", - "from typing import Any\n", - "\n", - "\n", - "def q4_pandas_native(\n", - " line_item_ds: Any,\n", - " orders_ds: Any,\n", - "):\n", - " var1 = date(1993, 7, 1)\n", - " var2 = date(1993, 10, 1)\n", - "\n", - " jn = line_item_ds.merge(orders_ds, left_on=\"l_orderkey\", right_on=\"o_orderkey\")\n", - "\n", - " jn = jn[\n", - " (jn[\"o_orderdate\"] < var2)\n", - " & (jn[\"o_orderdate\"] >= var1)\n", - " & (jn[\"l_commitdate\"] < jn[\"l_receiptdate\"])\n", - " ]\n", - "\n", - " jn = jn.drop_duplicates(subset=[\"o_orderpriority\", \"l_orderkey\"])\n", - "\n", - " gb = jn.groupby(\"o_orderpriority\", 
as_index=False)\n", - " agg = gb.agg(order_count=pd.NamedAgg(column=\"o_orderkey\", aggfunc=\"count\"))\n", - "\n", - " return agg.sort_values([\"o_orderpriority\"]) # type: ignore[no-any-return]" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "3", - "metadata": { - "papermill": { - "duration": 0.021725, - "end_time": "2024-03-22T17:24:39.080999", - "exception": false, - "start_time": "2024-03-22T17:24:39.059274", - "status": "completed" - }, - "tags": [] - }, - "outputs": [], - "source": [ - "from datetime import datetime\n", - "from typing import Any\n", - "\n", - "import narwhals as nw\n", - "\n", - "\n", - "def q4(\n", - " lineitem_ds_raw: Any,\n", - " orders_ds_raw: Any,\n", - ") -> Any:\n", - " var_1 = datetime(1993, 7, 1)\n", - " var_2 = datetime(1993, 10, 1)\n", - "\n", - " line_item_ds = nw.from_native(lineitem_ds_raw)\n", - " orders_ds = nw.from_native(orders_ds_raw)\n", - "\n", - " result = (\n", - " line_item_ds.join(orders_ds, left_on=\"l_orderkey\", right_on=\"o_orderkey\")\n", - " .filter(\n", - " nw.col(\"o_orderdate\").is_between(var_1, var_2, closed=\"left\"),\n", - " nw.col(\"l_commitdate\") < nw.col(\"l_receiptdate\"),\n", - " )\n", - " .unique(subset=[\"o_orderpriority\", \"l_orderkey\"])\n", - " .group_by(\"o_orderpriority\")\n", - " .agg(nw.len().alias(\"order_count\"))\n", - " .sort(by=\"o_orderpriority\")\n", - " .with_columns(nw.col(\"order_count\").cast(nw.Int64))\n", - " )\n", - "\n", - " return nw.to_native(result)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "4", - "metadata": {}, - "outputs": [], - "source": [ - "from typing import Any\n", - "\n", - "import ibis\n", - "\n", - "\n", - "def q4_ibis(lineitem: Any, orders: Any, *, tool: str) -> Any:\n", - " var1 = datetime(1993, 7, 1)\n", - " var2 = datetime(1993, 10, 1)\n", - "\n", - " q_final = (\n", - " lineitem.join(orders, lineitem[\"l_orderkey\"] == orders[\"o_orderkey\"])\n", - " .filter((orders[\"o_orderdate\"] >= var1) & (orders[\"o_orderdate\"] < var2))\n", - " .filter(lineitem[\"l_commitdate\"] < lineitem[\"l_receiptdate\"])\n", - " .distinct(on=[\"o_orderpriority\", \"l_orderkey\"])\n", - " .group_by(\"o_orderpriority\")\n", - " .agg(order_count=ibis._.count())\n", - " .order_by(\"o_orderpriority\")\n", - " )\n", - " if tool == \"pandas\":\n", - " return q_final.to_pandas()\n", - " if tool == \"polars\":\n", - " return q_final.to_polars()\n", - " raise ValueError(\"expected pandas or polars\")" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "5", - "metadata": { - "papermill": { - "duration": 0.013325, - "end_time": "2024-03-22T17:24:39.099766", - "exception": false, - "start_time": "2024-03-22T17:24:39.086441", - "status": "completed" - }, - "tags": [] - }, - "outputs": [], - "source": [ - "dir_ = \"/kaggle/input/tpc-h-data-parquet-s-2/\"\n", - "region = dir_ + \"region.parquet\"\n", - "nation = dir_ + \"nation.parquet\"\n", - "customer = dir_ + \"customer.parquet\"\n", - "lineitem = dir_ + \"lineitem.parquet\"\n", - "orders = dir_ + \"orders.parquet\"\n", - "supplier = dir_ + \"supplier.parquet\"\n", - "part = dir_ + \"part.parquet\"\n", - "partsupp = dir_ + \"partsupp.parquet\"" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "6", - "metadata": { - "papermill": { - "duration": 0.014284, - "end_time": "2024-03-22T17:24:39.119737", - "exception": false, - "start_time": "2024-03-22T17:24:39.105453", - "status": "completed" - }, - "tags": [] - }, - "outputs": [], - "source": [ - "con_pd = 
ibis.pandas.connect()\n", - "con_pl = ibis.polars.connect()\n", - "\n", - "IO_FUNCS = {\n", - " \"pandas\": lambda x: pd.read_parquet(x, engine=\"pyarrow\"),\n", - " \"pandas[pyarrow]\": lambda x: pd.read_parquet(\n", - " x, engine=\"pyarrow\", dtype_backend=\"pyarrow\"\n", - " ),\n", - " \"pandas[pyarrow][ibis]\": lambda x: con_pd.read_parquet(\n", - " x, engine=\"pyarrow\", dtype_backend=\"pyarrow\"\n", - " ),\n", - " \"polars[eager]\": lambda x: pl.read_parquet(x),\n", - " \"polars[lazy]\": lambda x: pl.scan_parquet(x),\n", - " \"polars[lazy][ibis]\": lambda x: con_pl.read_parquet(x),\n", - "}" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "7", - "metadata": {}, - "outputs": [], - "source": [ - "results = {}" - ] - }, - { - "cell_type": "markdown", - "id": "8", - "metadata": {}, - "source": [ - "## polars, lazy, via ibis" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "9", - "metadata": {}, - "outputs": [], - "source": [ - "tool = \"polars[lazy][ibis]\"\n", - "fn = IO_FUNCS[tool]\n", - "timings = %timeit -o -q q4_ibis(fn(lineitem), fn(orders), tool='polars')\n", - "results[tool] = timings.all_runs" - ] - }, - { - "cell_type": "markdown", - "id": "10", - "metadata": {}, - "source": [ - "## pandas, pyarrow dtype, via Narwhals" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "11", - "metadata": {}, - "outputs": [], - "source": [ - "tool = \"pandas[pyarrow]\"\n", - "fn = IO_FUNCS[tool]\n", - "timings = %timeit -o -q q4_pandas_native(fn(lineitem), fn(orders))\n", - "results[tool + \"[native]\"] = timings.all_runs" - ] - }, - { - "cell_type": "markdown", - "id": "12", - "metadata": { - "papermill": { - "duration": 0.005113, - "end_time": "2024-03-22T17:24:39.130472", - "exception": false, - "start_time": "2024-03-22T17:24:39.125359", - "status": "completed" - }, - "tags": [] - }, - "source": [ - "## pandas via Narwhals" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "13", - "metadata": { - "papermill": { - "duration": 196.786925, - "end_time": "2024-03-22T17:27:55.922832", - "exception": false, - "start_time": "2024-03-22T17:24:39.135907", - "status": "completed" - }, - "tags": [] - }, - "outputs": [], - "source": [ - "tool = \"pandas\"\n", - "fn = IO_FUNCS[tool]\n", - "timings = %timeit -o -q q4(fn(lineitem), fn(orders))\n", - "results[tool] = timings.all_runs" - ] - }, - { - "cell_type": "markdown", - "id": "14", - "metadata": { - "papermill": { - "duration": 0.005184, - "end_time": "2024-03-22T17:27:55.933407", - "exception": false, - "start_time": "2024-03-22T17:27:55.928223", - "status": "completed" - }, - "tags": [] - }, - "source": [ - "## pandas, pyarrow dtypes, via Narwhals" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "15", - "metadata": { - "papermill": { - "duration": 158.748353, - "end_time": "2024-03-22T17:30:34.688289", - "exception": false, - "start_time": "2024-03-22T17:27:55.939936", - "status": "completed" - }, - "tags": [] - }, - "outputs": [], - "source": [ - "tool = \"pandas[pyarrow]\"\n", - "fn = IO_FUNCS[tool]\n", - "timings = %timeit -o -q q4(fn(lineitem), fn(orders))\n", - "results[tool] = timings.all_runs" - ] - }, - { - "cell_type": "markdown", - "id": "16", - "metadata": { - "papermill": { - "duration": 0.005773, - "end_time": "2024-03-22T17:30:34.700300", - "exception": false, - "start_time": "2024-03-22T17:30:34.694527", - "status": "completed" - }, - "tags": [] - }, - "source": [ - "## Polars read_parquet" - ] - }, - { - "cell_type": "code", 
- "execution_count": null, - "id": "17", - "metadata": { - "papermill": { - "duration": 37.821116, - "end_time": "2024-03-22T17:31:12.527466", - "exception": false, - "start_time": "2024-03-22T17:30:34.706350", - "status": "completed" - }, - "tags": [] - }, - "outputs": [], - "source": [ - "tool = \"polars[eager]\"\n", - "fn = IO_FUNCS[tool]\n", - "timings = %timeit -o -q q4(fn(lineitem), fn(orders))\n", - "results[tool] = timings.all_runs" - ] - }, - { - "cell_type": "markdown", - "id": "18", - "metadata": { - "papermill": { - "duration": 0.005515, - "end_time": "2024-03-22T17:31:12.539068", - "exception": false, - "start_time": "2024-03-22T17:31:12.533553", - "status": "completed" - }, - "tags": [] - }, - "source": [ - "## Polars scan_parquet" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "19", - "metadata": { - "papermill": { - "duration": 4.800698, - "end_time": "2024-03-22T17:31:17.346813", - "exception": false, - "start_time": "2024-03-22T17:31:12.546115", - "status": "completed" - }, - "tags": [] - }, - "outputs": [], - "source": [ - "tool = \"polars[lazy]\"\n", - "fn = IO_FUNCS[tool]\n", - "timings = %timeit -o -q q4(fn(lineitem), fn(orders)).collect()\n", - "results[tool] = timings.all_runs" - ] - }, - { - "cell_type": "markdown", - "id": "20", - "metadata": {}, - "source": [ - "## Save" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "21", - "metadata": {}, - "outputs": [], - "source": [ - "import json\n", - "\n", - "with open(\"results.json\", \"w\") as fd:\n", - " json.dump(results, fd)" - ] - } - ], - "metadata": { - "kaggle": { - "accelerator": "none", - "dataSources": [ - { - "sourceId": 167796716, - "sourceType": "kernelVersion" - }, - { - "sourceId": 167796934, - "sourceType": "kernelVersion" - }, - { - "sourceId": 167796952, - "sourceType": "kernelVersion" - }, - { - "sourceId": 167796969, - "sourceType": "kernelVersion" - } - ], - "isGpuEnabled": false, - "isInternetEnabled": true, - "language": "python", - "sourceType": "notebook" - }, - "kernelspec": { - "display_name": "Python 3", - "language": "python", - "name": "python3" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.10.13" - }, - "papermill": { - "default_parameters": {}, - "duration": 458.423327, - "end_time": "2024-03-22T17:31:18.077306", - "environment_variables": {}, - "exception": null, - "input_path": "__notebook__.ipynb", - "output_path": "__notebook__.ipynb", - "parameters": {}, - "start_time": "2024-03-22T17:23:39.653979", - "version": "2.5.0" - } - }, - "nbformat": 4, - "nbformat_minor": 5 -} diff --git a/tpch/notebooks/q4/kernel-metadata.json b/tpch/notebooks/q4/kernel-metadata.json deleted file mode 100644 index 4e3936fbd..000000000 --- a/tpch/notebooks/q4/kernel-metadata.json +++ /dev/null @@ -1,15 +0,0 @@ -{ - "id": "marcogorelli/narwhals-tpch-q4-s2", - "title": "Narwhals TPCH Q4 S2", - "code_file": "execute.ipynb", - "language": "python", - "kernel_type": "notebook", - "is_private": "false", - "enable_gpu": "false", - "enable_tpu": "false", - "enable_internet": "true", - "dataset_sources": [], - "competition_sources": [], - "kernel_sources": ["marcogorelli/tpc-h-data-parquet-s-2"], - "model_sources": [] -} \ No newline at end of file diff --git a/tpch/notebooks/q5/execute.ipynb b/tpch/notebooks/q5/execute.ipynb deleted file mode 100755 index 
da0cae78b..000000000 --- a/tpch/notebooks/q5/execute.ipynb +++ /dev/null @@ -1,543 +0,0 @@ -{ - "cells": [ - { - "cell_type": "code", - "execution_count": null, - "id": "0", - "metadata": { - "papermill": { - "duration": 33.390992, - "end_time": "2024-03-22T17:24:15.601719", - "exception": false, - "start_time": "2024-03-22T17:23:42.210727", - "status": "completed" - }, - "tags": [] - }, - "outputs": [], - "source": [ - "!pip uninstall apache-beam -y && pip install -U pandas polars pyarrow narwhals" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "1", - "metadata": { - "papermill": { - "duration": 0.907754, - "end_time": "2024-03-22T17:24:39.053873", - "exception": false, - "start_time": "2024-03-22T17:24:38.146119", - "status": "completed" - }, - "tags": [] - }, - "outputs": [], - "source": [ - "import pandas as pd\n", - "import polars as pl\n", - "\n", - "pd.options.mode.copy_on_write = True\n", - "pd.options.future.infer_string = True" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "2", - "metadata": {}, - "outputs": [], - "source": [ - "from datetime import date\n", - "from typing import Any\n", - "\n", - "\n", - "def q5_pandas_native(\n", - " region_ds: Any,\n", - " nation_ds: Any,\n", - " customer_ds: Any,\n", - " line_item_ds: Any,\n", - " orders_ds: Any,\n", - " supplier_ds: Any,\n", - "):\n", - " var1 = \"ASIA\"\n", - " var2 = date(1994, 1, 1)\n", - " var3 = date(1995, 1, 1)\n", - "\n", - " jn1 = region_ds.merge(nation_ds, left_on=\"r_regionkey\", right_on=\"n_regionkey\")\n", - " jn2 = jn1.merge(customer_ds, left_on=\"n_nationkey\", right_on=\"c_nationkey\")\n", - " jn3 = jn2.merge(orders_ds, left_on=\"c_custkey\", right_on=\"o_custkey\")\n", - " jn4 = jn3.merge(line_item_ds, left_on=\"o_orderkey\", right_on=\"l_orderkey\")\n", - " jn5 = jn4.merge(\n", - " supplier_ds,\n", - " left_on=[\"l_suppkey\", \"n_nationkey\"],\n", - " right_on=[\"s_suppkey\", \"s_nationkey\"],\n", - " )\n", - "\n", - " jn5 = jn5[jn5[\"r_name\"] == var1]\n", - " jn5 = jn5[(jn5[\"o_orderdate\"] >= var2) & (jn5[\"o_orderdate\"] < var3)]\n", - " jn5[\"revenue\"] = jn5.l_extendedprice * (1.0 - jn5.l_discount)\n", - "\n", - " gb = jn5.groupby(\"n_name\", as_index=False)[\"revenue\"].sum()\n", - "\n", - " return gb.sort_values(\"revenue\", ascending=False) # type: ignore[no-any-return]" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "3", - "metadata": {}, - "outputs": [], - "source": [ - "from datetime import datetime\n", - "from typing import Any\n", - "\n", - "import narwhals as nw\n", - "\n", - "\n", - "def q5(\n", - " region_ds_raw: Any,\n", - " nation_ds_raw: Any,\n", - " customer_ds_raw: Any,\n", - " lineitem_ds_raw: Any,\n", - " orders_ds_raw: Any,\n", - " supplier_ds_raw: Any,\n", - ") -> Any:\n", - " var_1 = \"ASIA\"\n", - " var_2 = datetime(1994, 1, 1)\n", - " var_3 = datetime(1995, 1, 1)\n", - "\n", - " region_ds = nw.from_native(region_ds_raw)\n", - " nation_ds = nw.from_native(nation_ds_raw)\n", - " customer_ds = nw.from_native(customer_ds_raw)\n", - " line_item_ds = nw.from_native(lineitem_ds_raw)\n", - " orders_ds = nw.from_native(orders_ds_raw)\n", - " supplier_ds = nw.from_native(supplier_ds_raw)\n", - "\n", - " result = (\n", - " region_ds.join(nation_ds, left_on=\"r_regionkey\", right_on=\"n_regionkey\")\n", - " .join(customer_ds, left_on=\"n_nationkey\", right_on=\"c_nationkey\")\n", - " .join(orders_ds, left_on=\"c_custkey\", right_on=\"o_custkey\")\n", - " .join(line_item_ds, left_on=\"o_orderkey\", 
right_on=\"l_orderkey\")\n", - " .join(\n", - " supplier_ds,\n", - " left_on=[\"l_suppkey\", \"n_nationkey\"],\n", - " right_on=[\"s_suppkey\", \"s_nationkey\"],\n", - " )\n", - " .filter(\n", - " nw.col(\"r_name\") == var_1,\n", - " nw.col(\"o_orderdate\").is_between(var_2, var_3, closed=\"left\"),\n", - " )\n", - " .with_columns(\n", - " (nw.col(\"l_extendedprice\") * (1 - nw.col(\"l_discount\"))).alias(\"revenue\")\n", - " )\n", - " .group_by(\"n_name\")\n", - " .agg([nw.sum(\"revenue\")])\n", - " .sort(by=\"revenue\", descending=True)\n", - " )\n", - "\n", - " return nw.to_native(result)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "4", - "metadata": {}, - "outputs": [], - "source": [ - "from typing import Any\n", - "\n", - "import ibis\n", - "\n", - "\n", - "def q5_ibis(\n", - " region: Any,\n", - " nation: Any,\n", - " customer: Any,\n", - " lineitem: Any,\n", - " orders: Any,\n", - " supplier: Any,\n", - " *,\n", - " tool: str,\n", - ") -> Any:\n", - " var1 = \"ASIA\"\n", - " var2 = datetime(1994, 1, 1)\n", - " var3 = datetime(1995, 1, 1)\n", - "\n", - " q_final = (\n", - " region.join(nation, region[\"r_regionkey\"] == nation[\"n_regionkey\"])\n", - " .join(customer, ibis._[\"n_nationkey\"] == customer[\"c_nationkey\"])\n", - " .join(orders, ibis._[\"c_custkey\"] == orders[\"o_custkey\"])\n", - " .join(lineitem, ibis._[\"o_orderkey\"] == lineitem[\"l_orderkey\"])\n", - " .join(\n", - " supplier,\n", - " (ibis._[\"l_suppkey\"] == supplier[\"s_suppkey\"])\n", - " & (ibis._[\"n_nationkey\"] == supplier[\"s_nationkey\"]),\n", - " )\n", - " .filter(ibis._[\"r_name\"] == var1)\n", - " .filter((ibis._[\"o_orderdate\"] >= var2) & (ibis._[\"o_orderdate\"] < var3))\n", - " .mutate(revenue=(lineitem[\"l_extendedprice\"] * (1 - lineitem[\"l_discount\"])))\n", - " .group_by(\"n_name\")\n", - " .agg(revenue=ibis._[\"revenue\"].sum())\n", - " .order_by(ibis.desc(\"revenue\"))\n", - " )\n", - "\n", - " if tool == \"pandas\":\n", - " return q_final.to_pandas()\n", - " if tool == \"polars\":\n", - " return q_final.to_polars()\n", - " raise ValueError(\"expected pandas or polars\")" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "5", - "metadata": { - "papermill": { - "duration": 0.013325, - "end_time": "2024-03-22T17:24:39.099766", - "exception": false, - "start_time": "2024-03-22T17:24:39.086441", - "status": "completed" - }, - "tags": [] - }, - "outputs": [], - "source": [ - "dir_ = \"/kaggle/input/tpc-h-data-parquet-s-2/\"\n", - "region = dir_ + \"region.parquet\"\n", - "nation = dir_ + \"nation.parquet\"\n", - "customer = dir_ + \"customer.parquet\"\n", - "lineitem = dir_ + \"lineitem.parquet\"\n", - "orders = dir_ + \"orders.parquet\"\n", - "supplier = dir_ + \"supplier.parquet\"\n", - "part = dir_ + \"part.parquet\"\n", - "partsupp = dir_ + \"partsupp.parquet\"" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "6", - "metadata": { - "papermill": { - "duration": 0.014284, - "end_time": "2024-03-22T17:24:39.119737", - "exception": false, - "start_time": "2024-03-22T17:24:39.105453", - "status": "completed" - }, - "tags": [] - }, - "outputs": [], - "source": [ - "con_pd = ibis.pandas.connect()\n", - "con_pl = ibis.polars.connect()\n", - "\n", - "IO_FUNCS = {\n", - " \"pandas\": lambda x: pd.read_parquet(x, engine=\"pyarrow\"),\n", - " \"pandas[pyarrow]\": lambda x: pd.read_parquet(\n", - " x, engine=\"pyarrow\", dtype_backend=\"pyarrow\"\n", - " ),\n", - " \"pandas[pyarrow][ibis]\": lambda x: con_pd.read_parquet(\n", - " x, 
engine=\"pyarrow\", dtype_backend=\"pyarrow\"\n", - " ),\n", - " \"polars[eager]\": lambda x: pl.read_parquet(x),\n", - " \"polars[lazy]\": lambda x: pl.scan_parquet(x),\n", - " \"polars[lazy][ibis]\": lambda x: con_pl.read_parquet(x),\n", - "}" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "7", - "metadata": {}, - "outputs": [], - "source": [ - "results = {}" - ] - }, - { - "cell_type": "markdown", - "id": "8", - "metadata": {}, - "source": [ - "## Polars, lazy, via ibis" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "9", - "metadata": {}, - "outputs": [], - "source": [ - "tool = \"polars[lazy][ibis]\"\n", - "fn = IO_FUNCS[tool]\n", - "timings = %timeit -o -q q5_ibis(fn(region), fn(nation), fn(customer), fn(lineitem), fn(orders), fn(supplier), tool='polars')\n", - "results[tool] = timings.all_runs" - ] - }, - { - "cell_type": "markdown", - "id": "10", - "metadata": {}, - "source": [ - "## pandas, pyarrow dtypes, native" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "11", - "metadata": {}, - "outputs": [], - "source": [ - "tool = \"pandas[pyarrow]\"\n", - "fn = IO_FUNCS[tool]\n", - "timings = %timeit -o -q q5_pandas_native(fn(region), fn(nation), fn(customer), fn(lineitem), fn(orders), fn(supplier))\n", - "results[tool + \"[native]\"] = timings.all_runs" - ] - }, - { - "cell_type": "markdown", - "id": "12", - "metadata": { - "papermill": { - "duration": 0.005113, - "end_time": "2024-03-22T17:24:39.130472", - "exception": false, - "start_time": "2024-03-22T17:24:39.125359", - "status": "completed" - }, - "tags": [] - }, - "source": [ - "## pandas via Narwhals" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "13", - "metadata": { - "papermill": { - "duration": 196.786925, - "end_time": "2024-03-22T17:27:55.922832", - "exception": false, - "start_time": "2024-03-22T17:24:39.135907", - "status": "completed" - }, - "tags": [] - }, - "outputs": [], - "source": [ - "tool = \"pandas\"\n", - "fn = IO_FUNCS[tool]\n", - "timings = %timeit -o -q q5(fn(region), fn(nation), fn(customer), fn(lineitem), fn(orders), fn(supplier))\n", - "results[tool] = timings.all_runs" - ] - }, - { - "cell_type": "markdown", - "id": "14", - "metadata": { - "papermill": { - "duration": 0.005184, - "end_time": "2024-03-22T17:27:55.933407", - "exception": false, - "start_time": "2024-03-22T17:27:55.928223", - "status": "completed" - }, - "tags": [] - }, - "source": [ - "## pandas, pyarrow dtypes, via Narwhals" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "15", - "metadata": { - "papermill": { - "duration": 158.748353, - "end_time": "2024-03-22T17:30:34.688289", - "exception": false, - "start_time": "2024-03-22T17:27:55.939936", - "status": "completed" - }, - "tags": [] - }, - "outputs": [], - "source": [ - "tool = \"pandas[pyarrow]\"\n", - "fn = IO_FUNCS[tool]\n", - "timings = %timeit -o -q q5(fn(region), fn(nation), fn(customer), fn(lineitem), fn(orders), fn(supplier))\n", - "results[tool] = timings.all_runs" - ] - }, - { - "cell_type": "markdown", - "id": "16", - "metadata": { - "papermill": { - "duration": 0.005773, - "end_time": "2024-03-22T17:30:34.700300", - "exception": false, - "start_time": "2024-03-22T17:30:34.694527", - "status": "completed" - }, - "tags": [] - }, - "source": [ - "## Polars read_parquet" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "17", - "metadata": { - "papermill": { - "duration": 37.821116, - "end_time": "2024-03-22T17:31:12.527466", - 
"exception": false, - "start_time": "2024-03-22T17:30:34.706350", - "status": "completed" - }, - "tags": [] - }, - "outputs": [], - "source": [ - "tool = \"polars[eager]\"\n", - "fn = IO_FUNCS[tool]\n", - "timings = %timeit -o -q q5(fn(region), fn(nation), fn(customer), fn(lineitem), fn(orders), fn(supplier))\n", - "results[tool] = timings.all_runs" - ] - }, - { - "cell_type": "markdown", - "id": "18", - "metadata": { - "papermill": { - "duration": 0.005515, - "end_time": "2024-03-22T17:31:12.539068", - "exception": false, - "start_time": "2024-03-22T17:31:12.533553", - "status": "completed" - }, - "tags": [] - }, - "source": [ - "## Polars scan_parquet" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "19", - "metadata": { - "papermill": { - "duration": 4.800698, - "end_time": "2024-03-22T17:31:17.346813", - "exception": false, - "start_time": "2024-03-22T17:31:12.546115", - "status": "completed" - }, - "tags": [] - }, - "outputs": [], - "source": [ - "tool = \"polars[lazy]\"\n", - "fn = IO_FUNCS[tool]\n", - "timings = %timeit -o -q q5(fn(region), fn(nation), fn(customer), fn(lineitem), fn(orders), fn(supplier)).collect()\n", - "results[tool] = timings.all_runs" - ] - }, - { - "cell_type": "markdown", - "id": "20", - "metadata": {}, - "source": [ - "## Save" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "21", - "metadata": {}, - "outputs": [], - "source": [ - "import json\n", - "\n", - "with open(\"results.json\", \"w\") as fd:\n", - " json.dump(results, fd)" - ] - } - ], - "metadata": { - "kaggle": { - "accelerator": "none", - "dataSources": [ - { - "sourceId": 167796716, - "sourceType": "kernelVersion" - }, - { - "sourceId": 167796934, - "sourceType": "kernelVersion" - }, - { - "sourceId": 167796952, - "sourceType": "kernelVersion" - }, - { - "sourceId": 167796969, - "sourceType": "kernelVersion" - } - ], - "isGpuEnabled": false, - "isInternetEnabled": true, - "language": "python", - "sourceType": "notebook" - }, - "kernelspec": { - "display_name": "Python 3", - "language": "python", - "name": "python3" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.10.13" - }, - "papermill": { - "default_parameters": {}, - "duration": 458.423327, - "end_time": "2024-03-22T17:31:18.077306", - "environment_variables": {}, - "exception": null, - "input_path": "__notebook__.ipynb", - "output_path": "__notebook__.ipynb", - "parameters": {}, - "start_time": "2024-03-22T17:23:39.653979", - "version": "2.5.0" - } - }, - "nbformat": 4, - "nbformat_minor": 5 -} diff --git a/tpch/notebooks/q5/kernel-metadata.json b/tpch/notebooks/q5/kernel-metadata.json deleted file mode 100644 index fe2fecc96..000000000 --- a/tpch/notebooks/q5/kernel-metadata.json +++ /dev/null @@ -1,15 +0,0 @@ -{ - "id": "marcogorelli/narwhals-tpch-q5-s2", - "title": "Narwhals TPCH Q5 S2", - "code_file": "execute.ipynb", - "language": "python", - "kernel_type": "notebook", - "is_private": "false", - "enable_gpu": "false", - "enable_tpu": "false", - "enable_internet": "true", - "dataset_sources": [], - "competition_sources": [], - "kernel_sources": ["marcogorelli/tpc-h-data-parquet-s-2"], - "model_sources": [] -} \ No newline at end of file diff --git a/tpch/notebooks/q6/execute.ipynb b/tpch/notebooks/q6/execute.ipynb deleted file mode 100755 index 5abcb65f0..000000000 --- 
a/tpch/notebooks/q6/execute.ipynb +++ /dev/null @@ -1,516 +0,0 @@ -{ - "cells": [ - { - "cell_type": "code", - "execution_count": null, - "id": "0", - "metadata": { - "papermill": { - "duration": 33.390992, - "end_time": "2024-03-22T17:24:15.601719", - "exception": false, - "start_time": "2024-03-22T17:23:42.210727", - "status": "completed" - }, - "tags": [] - }, - "outputs": [], - "source": [ - "!pip uninstall apache-beam -y && pip install -U pandas polars pyarrow narwhals" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "1", - "metadata": { - "papermill": { - "duration": 0.907754, - "end_time": "2024-03-22T17:24:39.053873", - "exception": false, - "start_time": "2024-03-22T17:24:38.146119", - "status": "completed" - }, - "tags": [] - }, - "outputs": [], - "source": [ - "import pandas as pd\n", - "import polars as pl\n", - "\n", - "pd.options.mode.copy_on_write = True" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "2", - "metadata": {}, - "outputs": [], - "source": [ - "from datetime import date\n", - "\n", - "\n", - "def q6_pandas_native(line_item_ds):\n", - " var1 = date(1994, 1, 1)\n", - " var2 = date(1995, 1, 1)\n", - " var3 = 0.05\n", - " var4 = 0.07\n", - " var5 = 24\n", - "\n", - " flineitem = line_item_ds[\n", - " (line_item_ds[\"l_shipdate\"] >= var1)\n", - " & (line_item_ds[\"l_shipdate\"] < var2)\n", - " & (line_item_ds[\"l_discount\"] >= var3)\n", - " & (line_item_ds[\"l_discount\"] <= var4)\n", - " & (line_item_ds[\"l_quantity\"] < var5)\n", - " ]\n", - "\n", - " result_value = (flineitem[\"l_extendedprice\"] * flineitem[\"l_discount\"]).sum()\n", - "\n", - " return pd.DataFrame({\"revenue\": [result_value]})" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "3", - "metadata": { - "papermill": { - "duration": 0.021725, - "end_time": "2024-03-22T17:24:39.080999", - "exception": false, - "start_time": "2024-03-22T17:24:39.059274", - "status": "completed" - }, - "tags": [] - }, - "outputs": [], - "source": [ - "from datetime import datetime\n", - "\n", - "import narwhals as nw\n", - "\n", - "\n", - "def q6(line_item_raw) -> None:\n", - " var_1 = datetime(1994, 1, 1)\n", - " var_2 = datetime(1995, 1, 1)\n", - " var_3 = 24\n", - "\n", - " line_item_ds = nw.from_native(line_item_raw)\n", - "\n", - " result = (\n", - " line_item_ds.filter(\n", - " nw.col(\"l_shipdate\").is_between(var_1, var_2, closed=\"left\"),\n", - " nw.col(\"l_discount\").is_between(0.05, 0.07),\n", - " nw.col(\"l_quantity\") < var_3,\n", - " )\n", - " .with_columns((nw.col(\"l_extendedprice\") * nw.col(\"l_discount\")).alias(\"revenue\"))\n", - " .select(nw.sum(\"revenue\"))\n", - " )\n", - " return nw.to_native(result)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "4", - "metadata": {}, - "outputs": [], - "source": [ - "def q6_ibis(lineitem, *, tool: str) -> None:\n", - " var1 = datetime(1994, 1, 1)\n", - " var2 = datetime(1995, 1, 1)\n", - " var3 = 0.05\n", - " var4 = 0.07\n", - " var5 = 24\n", - "\n", - " q_final = (\n", - " lineitem.filter(\n", - " (lineitem[\"l_shipdate\"] >= var1) & (lineitem[\"l_shipdate\"] < var2)\n", - " )\n", - " .filter((lineitem[\"l_discount\"] >= var3) & (lineitem[\"l_discount\"] <= var4))\n", - " .filter(lineitem[\"l_quantity\"] < var5)\n", - " .mutate(revenue=ibis._[\"l_extendedprice\"] * (ibis._[\"l_discount\"]))\n", - " .agg(revenue=ibis._[\"revenue\"].sum())\n", - " )\n", - "\n", - " if tool == \"pandas\":\n", - " return q_final.to_pandas()\n", - " if tool == \"polars\":\n", - " 
return q_final.to_polars()\n", - " raise ValueError(\"expected pandas or polars\")" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "5", - "metadata": { - "papermill": { - "duration": 0.013325, - "end_time": "2024-03-22T17:24:39.099766", - "exception": false, - "start_time": "2024-03-22T17:24:39.086441", - "status": "completed" - }, - "tags": [] - }, - "outputs": [], - "source": [ - "dir_ = \"/kaggle/input/tpc-h-data-parquet-s-2/\"\n", - "region = dir_ + \"region.parquet\"\n", - "nation = dir_ + \"nation.parquet\"\n", - "customer = dir_ + \"customer.parquet\"\n", - "lineitem = dir_ + \"lineitem.parquet\"\n", - "orders = dir_ + \"orders.parquet\"\n", - "supplier = dir_ + \"supplier.parquet\"\n", - "part = dir_ + \"part.parquet\"\n", - "partsupp = dir_ + \"partsupp.parquet\"" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "6", - "metadata": { - "papermill": { - "duration": 0.014284, - "end_time": "2024-03-22T17:24:39.119737", - "exception": false, - "start_time": "2024-03-22T17:24:39.105453", - "status": "completed" - }, - "tags": [] - }, - "outputs": [], - "source": [ - "import ibis\n", - "\n", - "con_pd = ibis.pandas.connect()\n", - "con_pl = ibis.polars.connect()\n", - "\n", - "IO_FUNCS = {\n", - " \"pandas\": lambda x: pd.read_parquet(x, engine=\"pyarrow\"),\n", - " \"pandas[pyarrow]\": lambda x: pd.read_parquet(\n", - " x, engine=\"pyarrow\", dtype_backend=\"pyarrow\"\n", - " ),\n", - " \"pandas[pyarrow][ibis]\": lambda x: con_pd.read_parquet(\n", - " x, engine=\"pyarrow\", dtype_backend=\"pyarrow\"\n", - " ),\n", - " \"polars[eager]\": lambda x: pl.read_parquet(x),\n", - " \"polars[lazy]\": lambda x: pl.scan_parquet(x),\n", - " \"polars[lazy][ibis]\": lambda x: con_pl.read_parquet(x),\n", - "}" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "7", - "metadata": {}, - "outputs": [], - "source": [ - "results = {}" - ] - }, - { - "cell_type": "markdown", - "id": "8", - "metadata": {}, - "source": [ - "## pandas, pyarrow dtypes, via ibis" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "9", - "metadata": {}, - "outputs": [], - "source": [ - "tool = \"pandas[pyarrow][ibis]\"\n", - "fn = IO_FUNCS[tool]\n", - "timings = %timeit -o -q q6_ibis(fn(lineitem), tool='pandas')\n", - "results[tool] = timings.all_runs" - ] - }, - { - "cell_type": "markdown", - "id": "10", - "metadata": {}, - "source": [ - "## polars, lazy, via ibis" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "11", - "metadata": {}, - "outputs": [], - "source": [ - "tool = \"polars[lazy][ibis]\"\n", - "fn = IO_FUNCS[tool]\n", - "timings = %timeit -o -q q6_ibis(fn(lineitem), tool='polars')\n", - "results[tool] = timings.all_runs" - ] - }, - { - "cell_type": "markdown", - "id": "12", - "metadata": {}, - "source": [ - "## pandas, pyarrow dtypes, native" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "13", - "metadata": {}, - "outputs": [], - "source": [ - "tool = \"pandas[pyarrow]\"\n", - "fn = IO_FUNCS[tool]\n", - "timings = %timeit -o -q q6_pandas_native(fn(lineitem))\n", - "results[tool + \"[native]\"] = timings.all_runs" - ] - }, - { - "cell_type": "markdown", - "id": "14", - "metadata": { - "papermill": { - "duration": 0.005113, - "end_time": "2024-03-22T17:24:39.130472", - "exception": false, - "start_time": "2024-03-22T17:24:39.125359", - "status": "completed" - }, - "tags": [] - }, - "source": [ - "## pandas via Narwhals" - ] - }, - { - "cell_type": "code", - "execution_count": null, - 
"id": "15", - "metadata": { - "papermill": { - "duration": 196.786925, - "end_time": "2024-03-22T17:27:55.922832", - "exception": false, - "start_time": "2024-03-22T17:24:39.135907", - "status": "completed" - }, - "tags": [] - }, - "outputs": [], - "source": [ - "tool = \"pandas\"\n", - "fn = IO_FUNCS[tool]\n", - "timings = %timeit -o -q q6(fn(lineitem))\n", - "results[tool] = timings.all_runs" - ] - }, - { - "cell_type": "markdown", - "id": "16", - "metadata": { - "papermill": { - "duration": 0.005184, - "end_time": "2024-03-22T17:27:55.933407", - "exception": false, - "start_time": "2024-03-22T17:27:55.928223", - "status": "completed" - }, - "tags": [] - }, - "source": [ - "## pandas, pyarrow dtypes, via Narwhals" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "17", - "metadata": { - "papermill": { - "duration": 158.748353, - "end_time": "2024-03-22T17:30:34.688289", - "exception": false, - "start_time": "2024-03-22T17:27:55.939936", - "status": "completed" - }, - "tags": [] - }, - "outputs": [], - "source": [ - "tool = \"pandas[pyarrow]\"\n", - "fn = IO_FUNCS[tool]\n", - "timings = %timeit -o -q q6(fn(lineitem))\n", - "results[tool] = timings.all_runs" - ] - }, - { - "cell_type": "markdown", - "id": "18", - "metadata": { - "papermill": { - "duration": 0.005773, - "end_time": "2024-03-22T17:30:34.700300", - "exception": false, - "start_time": "2024-03-22T17:30:34.694527", - "status": "completed" - }, - "tags": [] - }, - "source": [ - "## Polars read_parquet" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "19", - "metadata": { - "papermill": { - "duration": 37.821116, - "end_time": "2024-03-22T17:31:12.527466", - "exception": false, - "start_time": "2024-03-22T17:30:34.706350", - "status": "completed" - }, - "tags": [] - }, - "outputs": [], - "source": [ - "tool = \"polars[eager]\"\n", - "fn = IO_FUNCS[tool]\n", - "timings = %timeit -o -q q6(fn(lineitem))\n", - "results[tool] = timings.all_runs" - ] - }, - { - "cell_type": "markdown", - "id": "20", - "metadata": { - "papermill": { - "duration": 0.005515, - "end_time": "2024-03-22T17:31:12.539068", - "exception": false, - "start_time": "2024-03-22T17:31:12.533553", - "status": "completed" - }, - "tags": [] - }, - "source": [ - "## Polars scan_parquet" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "21", - "metadata": { - "papermill": { - "duration": 4.800698, - "end_time": "2024-03-22T17:31:17.346813", - "exception": false, - "start_time": "2024-03-22T17:31:12.546115", - "status": "completed" - }, - "tags": [] - }, - "outputs": [], - "source": [ - "tool = \"polars[lazy]\"\n", - "fn = IO_FUNCS[tool]\n", - "timings = %timeit -o -q q6(fn(lineitem)).collect()\n", - "results[tool] = timings.all_runs" - ] - }, - { - "cell_type": "markdown", - "id": "22", - "metadata": {}, - "source": [ - "## Save" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "23", - "metadata": {}, - "outputs": [], - "source": [ - "import json\n", - "\n", - "with open(\"results.json\", \"w\") as fd:\n", - " json.dump(results, fd)" - ] - } - ], - "metadata": { - "kaggle": { - "accelerator": "none", - "dataSources": [ - { - "sourceId": 167796716, - "sourceType": "kernelVersion" - }, - { - "sourceId": 167796934, - "sourceType": "kernelVersion" - }, - { - "sourceId": 167796952, - "sourceType": "kernelVersion" - }, - { - "sourceId": 167796969, - "sourceType": "kernelVersion" - } - ], - "isGpuEnabled": false, - "isInternetEnabled": true, - "language": "python", - "sourceType": "notebook" 
- }, - "kernelspec": { - "display_name": "Python 3", - "language": "python", - "name": "python3" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.10.13" - }, - "papermill": { - "default_parameters": {}, - "duration": 458.423327, - "end_time": "2024-03-22T17:31:18.077306", - "environment_variables": {}, - "exception": null, - "input_path": "__notebook__.ipynb", - "output_path": "__notebook__.ipynb", - "parameters": {}, - "start_time": "2024-03-22T17:23:39.653979", - "version": "2.5.0" - } - }, - "nbformat": 4, - "nbformat_minor": 5 -} diff --git a/tpch/notebooks/q6/kernel-metadata.json b/tpch/notebooks/q6/kernel-metadata.json deleted file mode 100644 index d653fd2e9..000000000 --- a/tpch/notebooks/q6/kernel-metadata.json +++ /dev/null @@ -1,15 +0,0 @@ -{ - "id": "marcogorelli/narwhals-tpch-q6-s2", - "title": "Narwhals TPCH Q6 S2", - "code_file": "execute.ipynb", - "language": "python", - "kernel_type": "notebook", - "is_private": "false", - "enable_gpu": "false", - "enable_tpu": "false", - "enable_internet": "true", - "dataset_sources": [], - "competition_sources": [], - "kernel_sources": ["marcogorelli/tpc-h-data-parquet-s-2"], - "model_sources": [] -} \ No newline at end of file diff --git a/tpch/notebooks/q7/execute.ipynb b/tpch/notebooks/q7/execute.ipynb deleted file mode 100755 index 8711d7505..000000000 --- a/tpch/notebooks/q7/execute.ipynb +++ /dev/null @@ -1,603 +0,0 @@ -{ - "cells": [ - { - "cell_type": "code", - "execution_count": null, - "id": "0", - "metadata": { - "papermill": { - "duration": 33.390992, - "end_time": "2024-03-22T17:24:15.601719", - "exception": false, - "start_time": "2024-03-22T17:23:42.210727", - "status": "completed" - }, - "tags": [] - }, - "outputs": [], - "source": [ - "!pip uninstall apache-beam -y && pip install -U pandas polars pyarrow narwhals ibis-framework" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "1", - "metadata": { - "papermill": { - "duration": 0.907754, - "end_time": "2024-03-22T17:24:39.053873", - "exception": false, - "start_time": "2024-03-22T17:24:38.146119", - "status": "completed" - }, - "tags": [] - }, - "outputs": [], - "source": [ - "import pandas as pd\n", - "import polars as pl\n", - "\n", - "pd.options.mode.copy_on_write = True\n", - "pd.options.future.infer_string = True" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "2", - "metadata": {}, - "outputs": [], - "source": [ - "from datetime import date\n", - "from datetime import datetime\n", - "from typing import Any\n", - "\n", - "import narwhals as nw\n", - "\n", - "\n", - "def q7_pandas_native(\n", - " nation_ds,\n", - " customer_ds,\n", - " line_item_ds,\n", - " orders_ds,\n", - " supplier_ds,\n", - ") -> None:\n", - " var1 = \"FRANCE\"\n", - " var2 = \"GERMANY\"\n", - " var3 = date(1995, 1, 1)\n", - " var4 = date(1996, 12, 31)\n", - "\n", - " n1 = nation_ds[(nation_ds[\"n_name\"] == var1)]\n", - " n2 = nation_ds[(nation_ds[\"n_name\"] == var2)]\n", - "\n", - " # Part 1\n", - " jn1 = customer_ds.merge(n1, left_on=\"c_nationkey\", right_on=\"n_nationkey\")\n", - " jn2 = jn1.merge(orders_ds, left_on=\"c_custkey\", right_on=\"o_custkey\")\n", - " jn2 = jn2.rename({\"n_name\": \"cust_nation\"}, axis=\"columns\")\n", - " jn3 = jn2.merge(line_item_ds, left_on=\"o_orderkey\", right_on=\"l_orderkey\")\n", - " jn4 = jn3.merge(supplier_ds, 
left_on=\"l_suppkey\", right_on=\"s_suppkey\")\n", - " jn5 = jn4.merge(n2, left_on=\"s_nationkey\", right_on=\"n_nationkey\")\n", - " df1 = jn5.rename({\"n_name\": \"supp_nation\"}, axis=\"columns\")\n", - "\n", - " # Part 2\n", - " jn1 = customer_ds.merge(n2, left_on=\"c_nationkey\", right_on=\"n_nationkey\")\n", - " jn2 = jn1.merge(orders_ds, left_on=\"c_custkey\", right_on=\"o_custkey\")\n", - " jn2 = jn2.rename({\"n_name\": \"cust_nation\"}, axis=\"columns\")\n", - " jn3 = jn2.merge(line_item_ds, left_on=\"o_orderkey\", right_on=\"l_orderkey\")\n", - " jn4 = jn3.merge(supplier_ds, left_on=\"l_suppkey\", right_on=\"s_suppkey\")\n", - " jn5 = jn4.merge(n1, left_on=\"s_nationkey\", right_on=\"n_nationkey\")\n", - " df2 = jn5.rename({\"n_name\": \"supp_nation\"}, axis=\"columns\")\n", - "\n", - " # Combine\n", - " total = pd.concat([df1, df2])\n", - "\n", - " total = total[(total[\"l_shipdate\"] >= var3) & (total[\"l_shipdate\"] <= var4)]\n", - " total[\"volume\"] = total[\"l_extendedprice\"] * (1.0 - total[\"l_discount\"])\n", - " total[\"l_year\"] = total[\"l_shipdate\"].dt.year\n", - "\n", - " gb = total.groupby([\"supp_nation\", \"cust_nation\", \"l_year\"], as_index=False)\n", - " agg = gb.agg(revenue=pd.NamedAgg(column=\"volume\", aggfunc=\"sum\"))\n", - "\n", - " return agg.sort_values(by=[\"supp_nation\", \"cust_nation\", \"l_year\"]) # type: ignore[no-any-return]" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "3", - "metadata": { - "papermill": { - "duration": 0.021725, - "end_time": "2024-03-22T17:24:39.080999", - "exception": false, - "start_time": "2024-03-22T17:24:39.059274", - "status": "completed" - }, - "tags": [] - }, - "outputs": [], - "source": [ - "def q7(\n", - " nation_ds,\n", - " customer_ds,\n", - " line_item_ds,\n", - " orders_ds,\n", - " supplier_ds,\n", - ") -> None:\n", - " nation_ds = nw.from_native(nation_ds)\n", - " customer_ds = nw.from_native(customer_ds)\n", - " line_item_ds = nw.from_native(line_item_ds)\n", - " orders_ds = nw.from_native(orders_ds)\n", - " supplier_ds = nw.from_native(supplier_ds)\n", - "\n", - " n1 = nation_ds.filter(nw.col(\"n_name\") == \"FRANCE\")\n", - " n2 = nation_ds.filter(nw.col(\"n_name\") == \"GERMANY\")\n", - "\n", - " var_1 = datetime(1995, 1, 1)\n", - " var_2 = datetime(1996, 12, 31)\n", - "\n", - " df1 = (\n", - " customer_ds.join(n1, left_on=\"c_nationkey\", right_on=\"n_nationkey\")\n", - " .join(orders_ds, left_on=\"c_custkey\", right_on=\"o_custkey\")\n", - " .rename({\"n_name\": \"cust_nation\"})\n", - " .join(line_item_ds, left_on=\"o_orderkey\", right_on=\"l_orderkey\")\n", - " .join(supplier_ds, left_on=\"l_suppkey\", right_on=\"s_suppkey\")\n", - " .join(n2, left_on=\"s_nationkey\", right_on=\"n_nationkey\")\n", - " .rename({\"n_name\": \"supp_nation\"})\n", - " )\n", - "\n", - " df2 = (\n", - " customer_ds.join(n2, left_on=\"c_nationkey\", right_on=\"n_nationkey\")\n", - " .join(orders_ds, left_on=\"c_custkey\", right_on=\"o_custkey\")\n", - " .rename({\"n_name\": \"cust_nation\"})\n", - " .join(line_item_ds, left_on=\"o_orderkey\", right_on=\"l_orderkey\")\n", - " .join(supplier_ds, left_on=\"l_suppkey\", right_on=\"s_suppkey\")\n", - " .join(n1, left_on=\"s_nationkey\", right_on=\"n_nationkey\")\n", - " .rename({\"n_name\": \"supp_nation\"})\n", - " )\n", - "\n", - " result = (\n", - " nw.concat([df1, df2])\n", - " .filter(nw.col(\"l_shipdate\").is_between(var_1, var_2))\n", - " .with_columns(\n", - " (nw.col(\"l_extendedprice\") * (1 - nw.col(\"l_discount\"))).alias(\"volume\")\n", - " 
)\n", - " .with_columns(nw.col(\"l_shipdate\").dt.year().alias(\"l_year\"))\n", - " .group_by(\"supp_nation\", \"cust_nation\", \"l_year\")\n", - " .agg(nw.sum(\"volume\").alias(\"revenue\"))\n", - " .sort(by=[\"supp_nation\", \"cust_nation\", \"l_year\"])\n", - " )\n", - " return nw.to_native(result)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "4", - "metadata": {}, - "outputs": [], - "source": [ - "import ibis\n", - "\n", - "\n", - "def q7_ibis(\n", - " nation: Any, customer: Any, lineitem: Any, orders: Any, supplier: Any, *, tool: str\n", - ") -> None:\n", - " var1 = \"FRANCE\"\n", - " var2 = \"GERMANY\"\n", - " var3 = datetime(1995, 1, 1)\n", - " var4 = datetime(1996, 12, 31)\n", - "\n", - " n1 = nation.filter(nation[\"n_name\"] == var1)\n", - " n2 = nation.filter(nation[\"n_name\"] == var2)\n", - "\n", - " q1 = (\n", - " customer.join(n1, customer[\"c_nationkey\"] == n1[\"n_nationkey\"])\n", - " .join(orders, customer[\"c_custkey\"] == orders[\"o_custkey\"])\n", - " .rename({\"cust_nation\": \"n_name\"})\n", - " .join(lineitem, orders[\"o_orderkey\"] == lineitem[\"l_orderkey\"])\n", - " .join(supplier, lineitem[\"l_suppkey\"] == supplier[\"s_suppkey\"])\n", - " .join(n2, supplier[\"s_nationkey\"] == n2[\"n_nationkey\"])\n", - " .rename({\"supp_nation\": \"n_name\"})\n", - " )\n", - "\n", - " q2 = (\n", - " customer.join(n2, customer[\"c_nationkey\"] == n2[\"n_nationkey\"])\n", - " .join(orders, customer[\"c_custkey\"] == orders[\"o_custkey\"])\n", - " .rename({\"cust_nation\": \"n_name\"})\n", - " .join(lineitem, orders[\"o_orderkey\"] == lineitem[\"l_orderkey\"])\n", - " .join(supplier, lineitem[\"l_suppkey\"] == supplier[\"s_suppkey\"])\n", - " .join(n1, supplier[\"s_nationkey\"] == n1[\"n_nationkey\"])\n", - " .rename({\"supp_nation\": \"n_name\"})\n", - " )\n", - "\n", - " q_final = (\n", - " q1.union(q2)\n", - " .filter((ibis._[\"l_shipdate\"] >= var3) & (ibis._[\"l_shipdate\"] <= var4))\n", - " .mutate(\n", - " volume=(ibis._[\"l_extendedprice\"] * (1 - ibis._[\"l_discount\"])),\n", - " l_year=ibis._[\"l_shipdate\"].year(),\n", - " )\n", - " .group_by(\"supp_nation\", \"cust_nation\", \"l_year\")\n", - " .agg(revenue=ibis._[\"volume\"].sum())\n", - " .order_by(\"supp_nation\", \"cust_nation\", \"l_year\")\n", - " )\n", - "\n", - " if tool == \"pandas\":\n", - " return q_final.to_pandas()\n", - " if tool == \"polars\":\n", - " return q_final.to_polars()\n", - " raise ValueError(\"expected pandas or polars\")" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "5", - "metadata": { - "papermill": { - "duration": 0.013325, - "end_time": "2024-03-22T17:24:39.099766", - "exception": false, - "start_time": "2024-03-22T17:24:39.086441", - "status": "completed" - }, - "tags": [] - }, - "outputs": [], - "source": [ - "dir_ = \"/kaggle/input/tpc-h-data-parquet-s-2/\"\n", - "region = dir_ + \"region.parquet\"\n", - "nation = dir_ + \"nation.parquet\"\n", - "customer = dir_ + \"customer.parquet\"\n", - "lineitem = dir_ + \"lineitem.parquet\"\n", - "orders = dir_ + \"orders.parquet\"\n", - "supplier = dir_ + \"supplier.parquet\"\n", - "part = dir_ + \"part.parquet\"\n", - "partsupp = dir_ + \"partsupp.parquet\"" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "6", - "metadata": { - "papermill": { - "duration": 0.014284, - "end_time": "2024-03-22T17:24:39.119737", - "exception": false, - "start_time": "2024-03-22T17:24:39.105453", - "status": "completed" - }, - "tags": [] - }, - "outputs": [], - "source": [ - "con_pd = 
ibis.pandas.connect()\n", - "con_pl = ibis.polars.connect()\n", - "\n", - "IO_FUNCS = {\n", - " \"pandas\": lambda x: pd.read_parquet(x, engine=\"pyarrow\"),\n", - " \"pandas[pyarrow]\": lambda x: pd.read_parquet(\n", - " x, engine=\"pyarrow\", dtype_backend=\"pyarrow\"\n", - " ),\n", - " \"pandas[pyarrow][ibis]\": lambda x: con_pd.read_parquet(\n", - " x, engine=\"pyarrow\", dtype_backend=\"pyarrow\"\n", - " ),\n", - " \"polars[eager]\": lambda x: pl.read_parquet(x),\n", - " \"polars[lazy]\": lambda x: pl.scan_parquet(x),\n", - " \"polars[lazy][ibis]\": lambda x: con_pl.read_parquet(x),\n", - "}" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "7", - "metadata": {}, - "outputs": [], - "source": [ - "results = {}" - ] - }, - { - "cell_type": "markdown", - "id": "8", - "metadata": {}, - "source": [ - "## pandas, pyarrow dtypes, via ibis" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "9", - "metadata": {}, - "outputs": [], - "source": [ - "tool = \"pandas[pyarrow][ibis]\"\n", - "fn = IO_FUNCS[tool]\n", - "timings = %timeit -o -q q7_ibis(fn(nation), fn(customer), fn(lineitem), fn(orders), fn(supplier), tool='pandas')\n", - "results[tool] = timings.all_runs" - ] - }, - { - "cell_type": "markdown", - "id": "10", - "metadata": {}, - "source": [ - "## polars, lazy, via ibis" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "11", - "metadata": {}, - "outputs": [], - "source": [ - "tool = \"polars[lazy][ibis]\"\n", - "fn = IO_FUNCS[tool]\n", - "timings = %timeit -o -q q7_ibis(fn(nation), fn(customer), fn(lineitem), fn(orders), fn(supplier), tool='polars')\n", - "results[tool] = timings.all_runs" - ] - }, - { - "cell_type": "markdown", - "id": "12", - "metadata": {}, - "source": [ - "## pandas, pyarrow dtypes, native" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "13", - "metadata": {}, - "outputs": [], - "source": [ - "tool = \"pandas[pyarrow]\"\n", - "fn = IO_FUNCS[tool]\n", - "timings = %timeit -o -q q7_pandas_native(fn(nation), fn(customer), fn(lineitem), fn(orders), fn(supplier))\n", - "results[tool + \"[native]\"] = timings.all_runs" - ] - }, - { - "cell_type": "markdown", - "id": "14", - "metadata": { - "papermill": { - "duration": 0.005113, - "end_time": "2024-03-22T17:24:39.130472", - "exception": false, - "start_time": "2024-03-22T17:24:39.125359", - "status": "completed" - }, - "tags": [] - }, - "source": [ - "## pandas via Narwhals" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "15", - "metadata": { - "papermill": { - "duration": 196.786925, - "end_time": "2024-03-22T17:27:55.922832", - "exception": false, - "start_time": "2024-03-22T17:24:39.135907", - "status": "completed" - }, - "tags": [] - }, - "outputs": [], - "source": [ - "tool = \"pandas\"\n", - "fn = IO_FUNCS[tool]\n", - "timings = %timeit -o -q q7(fn(nation), fn(customer), fn(lineitem), fn(orders), fn(supplier))\n", - "results[tool] = timings.all_runs" - ] - }, - { - "cell_type": "markdown", - "id": "16", - "metadata": { - "papermill": { - "duration": 0.005184, - "end_time": "2024-03-22T17:27:55.933407", - "exception": false, - "start_time": "2024-03-22T17:27:55.928223", - "status": "completed" - }, - "tags": [] - }, - "source": [ - "## pandas, pyarrow dtypes, via Narwhals" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "17", - "metadata": { - "papermill": { - "duration": 158.748353, - "end_time": "2024-03-22T17:30:34.688289", - "exception": false, - "start_time": 
"2024-03-22T17:27:55.939936", - "status": "completed" - }, - "tags": [] - }, - "outputs": [], - "source": [ - "tool = \"pandas[pyarrow]\"\n", - "fn = IO_FUNCS[tool]\n", - "timings = %timeit -o -q q7(fn(nation), fn(customer), fn(lineitem), fn(orders), fn(supplier))\n", - "results[tool] = timings.all_runs" - ] - }, - { - "cell_type": "markdown", - "id": "18", - "metadata": { - "papermill": { - "duration": 0.005773, - "end_time": "2024-03-22T17:30:34.700300", - "exception": false, - "start_time": "2024-03-22T17:30:34.694527", - "status": "completed" - }, - "tags": [] - }, - "source": [ - "## Polars read_parquet" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "19", - "metadata": { - "papermill": { - "duration": 37.821116, - "end_time": "2024-03-22T17:31:12.527466", - "exception": false, - "start_time": "2024-03-22T17:30:34.706350", - "status": "completed" - }, - "tags": [] - }, - "outputs": [], - "source": [ - "tool = \"polars[eager]\"\n", - "fn = IO_FUNCS[tool]\n", - "timings = %timeit -o -q q7(fn(nation), fn(customer), fn(lineitem), fn(orders), fn(supplier))\n", - "results[tool] = timings.all_runs" - ] - }, - { - "cell_type": "markdown", - "id": "20", - "metadata": { - "papermill": { - "duration": 0.005515, - "end_time": "2024-03-22T17:31:12.539068", - "exception": false, - "start_time": "2024-03-22T17:31:12.533553", - "status": "completed" - }, - "tags": [] - }, - "source": [ - "## Polars scan_parquet" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "21", - "metadata": { - "papermill": { - "duration": 4.800698, - "end_time": "2024-03-22T17:31:17.346813", - "exception": false, - "start_time": "2024-03-22T17:31:12.546115", - "status": "completed" - }, - "tags": [] - }, - "outputs": [], - "source": [ - "tool = \"polars[lazy]\"\n", - "fn = IO_FUNCS[tool]\n", - "timings = %timeit -o -q q7(fn(nation), fn(customer), fn(lineitem), fn(orders), fn(supplier)).collect()\n", - "results[tool] = timings.all_runs" - ] - }, - { - "cell_type": "markdown", - "id": "22", - "metadata": {}, - "source": [ - "## Save" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "23", - "metadata": {}, - "outputs": [], - "source": [ - "import json\n", - "\n", - "with open(\"results.json\", \"w\") as fd:\n", - " json.dump(results, fd)" - ] - } - ], - "metadata": { - "kaggle": { - "accelerator": "none", - "dataSources": [ - { - "sourceId": 167796716, - "sourceType": "kernelVersion" - }, - { - "sourceId": 167796934, - "sourceType": "kernelVersion" - }, - { - "sourceId": 167796952, - "sourceType": "kernelVersion" - }, - { - "sourceId": 167796969, - "sourceType": "kernelVersion" - } - ], - "isGpuEnabled": false, - "isInternetEnabled": true, - "language": "python", - "sourceType": "notebook" - }, - "kernelspec": { - "display_name": "Python 3", - "language": "python", - "name": "python3" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.8.19" - }, - "papermill": { - "default_parameters": {}, - "duration": 458.423327, - "end_time": "2024-03-22T17:31:18.077306", - "environment_variables": {}, - "exception": null, - "input_path": "__notebook__.ipynb", - "output_path": "__notebook__.ipynb", - "parameters": {}, - "start_time": "2024-03-22T17:23:39.653979", - "version": "2.5.0" - } - }, - "nbformat": 4, - "nbformat_minor": 5 -} diff --git 
a/tpch/notebooks/q7/kernel-metadata.json b/tpch/notebooks/q7/kernel-metadata.json deleted file mode 100644 index 65997e8ae..000000000 --- a/tpch/notebooks/q7/kernel-metadata.json +++ /dev/null @@ -1,15 +0,0 @@ -{ - "id": "marcogorelli/narwhals-tpch-q7-s2", - "title": "Narwhals TPCH Q7 S2", - "code_file": "execute.ipynb", - "language": "python", - "kernel_type": "notebook", - "is_private": "false", - "enable_gpu": "false", - "enable_tpu": "false", - "enable_internet": "true", - "dataset_sources": [], - "competition_sources": [], - "kernel_sources": ["marcogorelli/tpc-h-data-parquet-s-2"], - "model_sources": [] -} \ No newline at end of file diff --git a/tpch/notebooks/q9/execute.ipynb b/tpch/notebooks/q9/execute.ipynb deleted file mode 100644 index 802799a01..000000000 --- a/tpch/notebooks/q9/execute.ipynb +++ /dev/null @@ -1,378 +0,0 @@ -{ - "cells": [ - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "papermill": { - "duration": 33.390992, - "end_time": "2024-03-22T17:24:15.601719", - "exception": false, - "start_time": "2024-03-22T17:23:42.210727", - "status": "completed" - }, - "tags": [] - }, - "outputs": [], - "source": [ - "!pip uninstall apache-beam -y && pip install -U pandas polars pyarrow narwhals" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "papermill": { - "duration": 0.907754, - "end_time": "2024-03-22T17:24:39.053873", - "exception": false, - "start_time": "2024-03-22T17:24:38.146119", - "status": "completed" - }, - "tags": [] - }, - "outputs": [], - "source": [ - "import pandas as pd\n", - "import polars as pl\n", - "\n", - "pd.options.mode.copy_on_write = True\n", - "pd.options.future.infer_string = True" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "papermill": { - "duration": 0.021725, - "end_time": "2024-03-22T17:24:39.080999", - "exception": false, - "start_time": "2024-03-22T17:24:39.059274", - "status": "completed" - }, - "tags": [] - }, - "outputs": [], - "source": [ - "from typing import Any\n", - "\n", - "import narwhals as nw\n", - "\n", - "\n", - "def q9(\n", - " part_ds_raw: Any,\n", - " partsupp_ds_raw: Any,\n", - " nation_ds_raw: Any,\n", - " lineitem_ds_raw: Any,\n", - " orders_ds_raw: Any,\n", - " supplier_ds_raw: Any,\n", - ") -> Any:\n", - " part_ds = nw.from_native(part_ds_raw)\n", - " nation_ds = nw.from_native(nation_ds_raw)\n", - " partsupp_ds = nw.from_native(partsupp_ds_raw)\n", - " lineitem_ds = nw.from_native(lineitem_ds_raw)\n", - " orders_ds = nw.from_native(orders_ds_raw)\n", - " supplier_ds = nw.from_native(supplier_ds_raw)\n", - "\n", - " result = (\n", - " part_ds.join(partsupp_ds, left_on=\"p_partkey\", right_on=\"ps_partkey\")\n", - " .join(supplier_ds, left_on=\"ps_suppkey\", right_on=\"s_suppkey\")\n", - " .join(\n", - " lineitem_ds,\n", - " left_on=[\"p_partkey\", \"ps_suppkey\"],\n", - " right_on=[\"l_partkey\", \"l_suppkey\"],\n", - " )\n", - " .join(orders_ds, left_on=\"l_orderkey\", right_on=\"o_orderkey\")\n", - " .join(nation_ds, left_on=\"s_nationkey\", right_on=\"n_nationkey\")\n", - " .filter(nw.col(\"p_name\").str.contains(\"green\"))\n", - " .select(\n", - " nw.col(\"n_name\").alias(\"nation\"),\n", - " nw.col(\"o_orderdate\").dt.year().alias(\"o_year\"),\n", - " (\n", - " nw.col(\"l_extendedprice\") * (1 - nw.col(\"l_discount\"))\n", - " - nw.col(\"ps_supplycost\") * nw.col(\"l_quantity\")\n", - " ).alias(\"amount\"),\n", - " )\n", - " .group_by(\"nation\", \"o_year\")\n", - " .agg(nw.sum(\"amount\").alias(\"sum_profit\"))\n", - " 
.sort(by=[\"nation\", \"o_year\"], descending=[False, True])\n", - " )\n", - "\n", - " return nw.to_native(result)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "papermill": { - "duration": 0.013325, - "end_time": "2024-03-22T17:24:39.099766", - "exception": false, - "start_time": "2024-03-22T17:24:39.086441", - "status": "completed" - }, - "tags": [] - }, - "outputs": [], - "source": [ - "dir_ = \"/kaggle/input/tpc-h-data-parquet-s-2/\"\n", - "nation = dir_ + \"nation.parquet\"\n", - "lineitem = dir_ + \"lineitem.parquet\"\n", - "orders = dir_ + \"orders.parquet\"\n", - "supplier = dir_ + \"supplier.parquet\"\n", - "part = dir_ + \"part.parquet\"\n", - "partsupp = dir_ + \"partsupp.parquet\"" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "papermill": { - "duration": 0.014284, - "end_time": "2024-03-22T17:24:39.119737", - "exception": false, - "start_time": "2024-03-22T17:24:39.105453", - "status": "completed" - }, - "tags": [] - }, - "outputs": [], - "source": [ - "IO_FUNCS = {\n", - " \"pandas\": lambda x: pd.read_parquet(x, engine=\"pyarrow\"),\n", - " \"pandas[pyarrow]\": lambda x: pd.read_parquet(\n", - " x, engine=\"pyarrow\", dtype_backend=\"pyarrow\"\n", - " ),\n", - " \"polars[eager]\": lambda x: pl.read_parquet(x),\n", - " \"polars[lazy]\": lambda x: pl.scan_parquet(x),\n", - "}" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "results = {}" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "papermill": { - "duration": 0.005113, - "end_time": "2024-03-22T17:24:39.130472", - "exception": false, - "start_time": "2024-03-22T17:24:39.125359", - "status": "completed" - }, - "tags": [] - }, - "source": [ - "## pandas via Narwhals" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "papermill": { - "duration": 196.786925, - "end_time": "2024-03-22T17:27:55.922832", - "exception": false, - "start_time": "2024-03-22T17:24:39.135907", - "status": "completed" - }, - "tags": [] - }, - "outputs": [], - "source": [ - "tool = \"pandas\"\n", - "fn = IO_FUNCS[tool]\n", - "timings = %timeit -o -q q9(fn(part), fn(partsupp), fn(nation), fn(lineitem), fn(orders), fn(supplier))\n", - "results[tool] = timings.all_runs" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "papermill": { - "duration": 0.005184, - "end_time": "2024-03-22T17:27:55.933407", - "exception": false, - "start_time": "2024-03-22T17:27:55.928223", - "status": "completed" - }, - "tags": [] - }, - "source": [ - "## pandas, pyarrow dtypes, via Narwhals" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "papermill": { - "duration": 158.748353, - "end_time": "2024-03-22T17:30:34.688289", - "exception": false, - "start_time": "2024-03-22T17:27:55.939936", - "status": "completed" - }, - "tags": [] - }, - "outputs": [], - "source": [ - "tool = \"pandas[pyarrow]\"\n", - "fn = IO_FUNCS[tool]\n", - "timings = %timeit -o -q q9(fn(part), fn(partsupp), fn(nation), fn(lineitem), fn(orders), fn(supplier))\n", - "results[tool] = timings.all_runs" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "papermill": { - "duration": 0.005773, - "end_time": "2024-03-22T17:30:34.7003", - "exception": false, - "start_time": "2024-03-22T17:30:34.694527", - "status": "completed" - }, - "tags": [] - }, - "source": [ - "## Polars read_parquet" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "papermill": { - 
"duration": 37.821116, - "end_time": "2024-03-22T17:31:12.527466", - "exception": false, - "start_time": "2024-03-22T17:30:34.70635", - "status": "completed" - }, - "tags": [] - }, - "outputs": [], - "source": [ - "tool = \"polars[eager]\"\n", - "fn = IO_FUNCS[tool]\n", - "timings = %timeit -o -q q9(fn(part), fn(partsupp), fn(nation), fn(lineitem), fn(orders), fn(supplier))\n", - "results[tool] = timings.all_runs" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "papermill": { - "duration": 0.005515, - "end_time": "2024-03-22T17:31:12.539068", - "exception": false, - "start_time": "2024-03-22T17:31:12.533553", - "status": "completed" - }, - "tags": [] - }, - "source": [ - "## Polars scan_parquet" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "papermill": { - "duration": 4.800698, - "end_time": "2024-03-22T17:31:17.346813", - "exception": false, - "start_time": "2024-03-22T17:31:12.546115", - "status": "completed" - }, - "tags": [] - }, - "outputs": [], - "source": [ - "tool = \"polars[lazy]\"\n", - "fn = IO_FUNCS[tool]\n", - "timings = %timeit -o -q q9(fn(part), fn(partsupp), fn(nation), fn(lineitem), fn(orders), fn(supplier)).collect()\n", - "results[tool] = timings.all_runs" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Save" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "import json\n", - "\n", - "with open(\"results.json\", \"w\") as fd:\n", - " json.dump(results, fd)" - ] - } - ], - "metadata": { - "kaggle": { - "accelerator": "none", - "dataSources": [ - { - "sourceId": 167796934, - "sourceType": "kernelVersion" - } - ], - "dockerImageVersionId": 30673, - "isGpuEnabled": false, - "isInternetEnabled": true, - "language": "python", - "sourceType": "notebook" - }, - "kernelspec": { - "display_name": "Python 3", - "language": "python", - "name": "python3" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.10.13" - }, - "papermill": { - "default_parameters": {}, - "duration": 458.423327, - "end_time": "2024-03-22T17:31:18.077306", - "environment_variables": {}, - "exception": null, - "input_path": "__notebook__.ipynb", - "output_path": "__notebook__.ipynb", - "parameters": {}, - "start_time": "2024-03-22T17:23:39.653979", - "version": "2.5.0" - } - }, - "nbformat": 4, - "nbformat_minor": 4 -} diff --git a/tpch/notebooks/q9/kernel-metadata.json b/tpch/notebooks/q9/kernel-metadata.json deleted file mode 100644 index bdbebcfeb..000000000 --- a/tpch/notebooks/q9/kernel-metadata.json +++ /dev/null @@ -1,15 +0,0 @@ -{ - "id": "marcogorelli/narwhals-tpch-q9-s2", - "title": "Narwhals TPCH Q9 S2", - "code_file": "execute.ipynb", - "language": "python", - "kernel_type": "notebook", - "is_private": "false", - "enable_gpu": "false", - "enable_tpu": "false", - "enable_internet": "true", - "dataset_sources": [], - "competition_sources": [], - "kernel_sources": ["marcogorelli/tpc-h-data-parquet-s-2"], - "model_sources": [] -} \ No newline at end of file From 3f186da4768e9a24c10d1ed887d61081e05087dc Mon Sep 17 00:00:00 2001 From: Alessandro Miola <37796412+AlessandroMiola@users.noreply.github.com> Date: Tue, 15 Oct 2024 15:14:11 +0200 Subject: [PATCH 119/145] ci: add downstream tests for `marimo` (#1166) --- .github/workflows/downstream_tests.yml | 58 
++++++++++++++++++++++++++ 1 file changed, 58 insertions(+) diff --git a/.github/workflows/downstream_tests.yml b/.github/workflows/downstream_tests.yml index d84129d79..542dacbbf 100644 --- a/.github/workflows/downstream_tests.yml +++ b/.github/workflows/downstream_tests.yml @@ -50,6 +50,64 @@ jobs: cd altair mypy altair tests + marimo: + strategy: + matrix: + python-version: ["3.12"] + os: [ubuntu-latest] + dependencies: ["core", "core,optional"] + + runs-on: ${{ matrix.os }} + steps: + - uses: actions/checkout@v4 + - uses: actions/setup-python@v5 + with: + python-version: ${{ matrix.python-version }} + - name: Install uv + uses: astral-sh/setup-uv@v3 + with: + enable-cache: "true" + cache-suffix: ${{ matrix.python-version }} + cache-dependency-glob: "**requirements*.txt" + - name: clone-marimo + run: | + git clone https://github.com/marimo-team/marimo.git --depth=1 + cd marimo + git log + - name: install-basics + run: uv pip install --upgrade tox virtualenv setuptools hatch --system + - name: install-marimo-dev + run: | + cd marimo + uv pip install -e ".[dev]" --system + - name: install-narwhals-dev + run: | + uv pip uninstall narwhals --system + uv pip install -e . --system + - name: show-deps + run: uv pip freeze + - name: Create assets directory, copy over index.html + run: | + mkdir -p marimo/marimo/_static/assets + cp marimo/frontend/index.html marimo/marimo/_static/index.html + cp marimo/frontend/public/favicon.ico marimo/marimo/_static/favicon.ico + - name: Run tests with minimal dependencies + if: ${{ matrix.dependencies == 'core' }} + run: | + cd marimo + hatch run +py=${{ matrix.python-version }} test:test -v tests/ -k "not test_cli" + timeout-minutes: 15 + - name: Run tests with optional dependencies + if: ${{ matrix.dependencies == 'core,optional' }} + run: | + cd marimo + hatch run +py=${{ matrix.python-version }} test-optional:test -v tests/ -k "not test_cli" + timeout-minutes: 15 + - name: Run typechecks + run: | + cd marimo + hatch run typecheck:check + scikit-lego: strategy: matrix: From 3e720b1fa4d5f99afd99153703c4b8b7a4859cb8 Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Wed, 16 Oct 2024 08:14:46 +0200 Subject: [PATCH 120/145] skip changelog(deps): bump actions/github-script from 6 to 7 (#1179) Bumps [actions/github-script](https://github.com/actions/github-script) from 6 to 7. - [Release notes](https://github.com/actions/github-script/releases) - [Commits](https://github.com/actions/github-script/compare/v6...v7) --- updated-dependencies: - dependency-name: actions/github-script dependency-type: direct:production update-type: version-update:semver-major ... 
Signed-off-by: dependabot[bot] Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com> --- .github/workflows/bump-version.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/bump-version.yml b/.github/workflows/bump-version.yml index 3577fd0e5..356ee76c2 100644 --- a/.github/workflows/bump-version.yml +++ b/.github/workflows/bump-version.yml @@ -33,7 +33,7 @@ jobs: python utils/bump_version.py ${{ github.event.inputs.release_type }} - name: Create pull request - uses: actions/github-script@v6 + uses: actions/github-script@v7 if: github.actor == 'MarcoGorelli' || github.actor == 'FBruzzesi' with: script: | From 3a167ab2c2505328b01cd82865a3ecc7f1dc85ff Mon Sep 17 00:00:00 2001 From: Marco Edward Gorelli Date: Wed, 16 Oct 2024 08:07:11 +0100 Subject: [PATCH 121/145] only run doctests on py3.12 (#1189) --- .github/workflows/extremes.yml | 4 ---- .github/workflows/pytest.yml | 6 +---- narwhals/dataframe.py | 42 +++++++++++++++++----------------- narwhals/expr.py | 16 ++++++------- narwhals/functions.py | 2 +- narwhals/schema.py | 2 +- narwhals/series.py | 42 ++++++++++++++++------------------ narwhals/stable/v1/__init__.py | 4 ++-- 8 files changed, 53 insertions(+), 65 deletions(-) diff --git a/.github/workflows/extremes.yml b/.github/workflows/extremes.yml index 858d0b6e2..cf488fd2d 100644 --- a/.github/workflows/extremes.yml +++ b/.github/workflows/extremes.yml @@ -59,8 +59,6 @@ jobs: run: uv pip freeze - name: Run pytest run: pytest tests --cov=narwhals --cov=tests --cov-fail-under=50 --runslow - - name: Run doctests - run: pytest narwhals --doctest-modules not_so_old_versions: strategy: @@ -88,8 +86,6 @@ jobs: run: uv pip freeze - name: Run pytest run: pytest tests --cov=narwhals --cov=tests --cov-fail-under=50 --runslow - - name: Run doctests - run: pytest narwhals --doctest-modules nightlies: strategy: diff --git a/.github/workflows/pytest.yml b/.github/workflows/pytest.yml index 20058a435..ee88911ea 100644 --- a/.github/workflows/pytest.yml +++ b/.github/workflows/pytest.yml @@ -30,9 +30,6 @@ jobs: run: uv pip freeze - name: Run pytest run: pytest tests --cov=narwhals --cov=tests --cov-fail-under=85 - - name: Run doctests - if: startsWith(matrix.os, 'windows') != true - run: pytest narwhals --doctest-modules pytest-windows: strategy: @@ -60,8 +57,6 @@ jobs: run: uv pip freeze - name: Run pytest run: pytest tests --cov=narwhals --cov=tests --runslow --cov-fail-under=95 - - name: Run doctests - run: pytest narwhals --doctest-modules pytest-coverage: strategy: @@ -95,4 +90,5 @@ jobs: - name: Run pytest run: pytest tests --cov=narwhals --cov=tests --cov-fail-under=100 --runslow - name: Run doctests + if: matrix.python-version == '3.12' run: pytest narwhals --doctest-modules diff --git a/narwhals/dataframe.py b/narwhals/dataframe.py index b0ac1c329..5c8c7e13e 100644 --- a/narwhals/dataframe.py +++ b/narwhals/dataframe.py @@ -546,12 +546,12 @@ def write_csv(self, file: str | Path | BytesIO | None = None) -> Any: We can pass any supported library such as pandas, Polars or PyArrow to `func`: - >>> func(df_pd) # doctest: +SKIP - 'foo,bar,ham\n1,6.0,a\n2,7.0,b\n3,8.0,c\n' - >>> func(df_pl) # doctest: +SKIP + >>> func(df_pd) 'foo,bar,ham\n1,6.0,a\n2,7.0,b\n3,8.0,c\n' - >>> func(df_pa) # doctest: +SKIP + >>> func(df_pl) 'foo,bar,ham\n1,6.0,a\n2,7.0,b\n3,8.0,c\n' + >>> func(df_pa) + '"foo","bar","ham"\n1,6,"a"\n2,7,"b"\n3,8,"c"\n' If we had passed a file name to `write_csv`, it would have been written to that file. 
@@ -582,9 +582,9 @@ def write_parquet(self, file: str | Path | BytesIO) -> Any: We can then pass either pandas, Polars or PyArrow to `func`: - >>> func(df_pd) # doctest:+SKIP - >>> func(df_pl) # doctest:+SKIP - >>> func(df_pa) # doctest:+SKIP + >>> func(df_pd) + >>> func(df_pl) + >>> func(df_pa) """ self._compliant_frame.write_parquet(file) @@ -1116,12 +1116,12 @@ def schema(self) -> Schema: You can pass either pandas or Polars to `func`: >>> df_pd_schema = func(df_pd) - >>> df_pd_schema # doctest:+SKIP - Schema({'foo': Int64, 'bar': Float64, 'ham', String}) + >>> df_pd_schema + Schema({'foo': Int64, 'bar': Float64, 'ham': String}) >>> df_pl_schema = func(df_pl) - >>> df_pl_schema # doctest:+SKIP - Schema({'foo': Int64, 'bar': Float64, 'ham', String}) + >>> df_pl_schema + Schema({'foo': Int64, 'bar': Float64, 'ham': String}) """ return super().schema @@ -1150,12 +1150,12 @@ def collect_schema(self: Self) -> Schema: You can pass either pandas or Polars to `func`: >>> df_pd_schema = func(df_pd) - >>> df_pd_schema # doctest:+SKIP - Schema({'foo': Int64, 'bar': Float64, 'ham', String}) + >>> df_pd_schema + Schema({'foo': Int64, 'bar': Float64, 'ham': String}) >>> df_pl_schema = func(df_pl) - >>> df_pl_schema # doctest:+SKIP - Schema({'foo': Int64, 'bar': Float64, 'ham', String}) + >>> df_pl_schema + Schema({'foo': Int64, 'bar': Float64, 'ham': String}) """ return super().collect_schema() @@ -2478,8 +2478,8 @@ def item(self: Self, row: int | None = None, column: int | str | None = None) -> We can then pass either pandas or Polars to `func`: - >>> func(df_pd, 1, 1), func(df_pd, 2, "b") # doctest:+SKIP - (5, 6) + >>> func(df_pd, 1, 1), func(df_pd, 2, "b") + (np.int64(5), np.int64(6)) >>> func(df_pl, 1, 1), func(df_pl, 2, "b") (5, 6) @@ -2581,7 +2581,7 @@ def to_arrow(self: Self) -> pa.Table: ... def func(df): ... return df.to_arrow() - >>> func(df_pd) # doctest:+SKIP + >>> func(df_pd) pyarrow.Table foo: int64 bar: string @@ -3010,7 +3010,7 @@ def schema(self) -> Schema: ... } ... ) >>> lf = nw.from_native(lf_pl) - >>> lf.schema # doctest:+SKIP + >>> lf.schema # doctest: +SKIP Schema({'foo': Int64, 'bar': Float64, 'ham', String}) """ return super().schema @@ -3030,8 +3030,8 @@ def collect_schema(self: Self) -> Schema: ... } ... ) >>> lf = nw.from_native(lf_pl) - >>> lf.collect_schema() # doctest:+SKIP - Schema({'foo': Int64, 'bar': Float64, 'ham', String}) + >>> lf.collect_schema() + Schema({'foo': Int64, 'bar': Float64, 'ham': String}) """ return super().collect_schema() diff --git a/narwhals/expr.py b/narwhals/expr.py index edd52b305..6eedbafa4 100644 --- a/narwhals/expr.py +++ b/narwhals/expr.py @@ -1262,12 +1262,12 @@ def sample( We can then pass either pandas or Polars to `func`: - >>> func(df_pd) # doctest:+SKIP + >>> func(df_pd) # doctest: +SKIP a 2 3 0 1 2 3 - >>> func(df_pl) # doctest:+SKIP + >>> func(df_pl) # doctest: +SKIP shape: (3, 1) ┌─────┐ │ a │ @@ -2662,9 +2662,7 @@ def date(self) -> Expr: >>> from datetime import datetime >>> import narwhals as nw >>> data = {"a": [datetime(2012, 1, 7, 10, 20), datetime(2023, 3, 10, 11, 32)]} - >>> df_pd = pd.DataFrame(data).convert_dtypes( - ... dtype_backend="pyarrow" - ... 
) # doctest:+SKIP + >>> df_pd = pd.DataFrame(data).convert_dtypes(dtype_backend="pyarrow") >>> df_pl = pl.DataFrame(data) We define a library agnostic function: @@ -2675,7 +2673,7 @@ def date(self) -> Expr: We can then pass either pandas or Polars to `func`: - >>> func(df_pd) # doctest:+SKIP + >>> func(df_pd) a 0 2012-01-07 1 2023-03-10 @@ -3546,7 +3544,7 @@ def replace_time_zone(self, time_zone: str | None) -> Expr: │ 2024-01-01 00:00:00 +0545 │ │ 2024-01-02 00:00:00 +0545 │ └──────────────────────────────┘ - >>> func(df_pa) # doctest:+SKIP + >>> func(df_pa) pyarrow.Table a: timestamp[us, tz=Asia/Kathmandu] ---- @@ -3601,7 +3599,7 @@ def convert_time_zone(self, time_zone: str) -> Expr: │ 2024-01-01 05:45:00 +0545 │ │ 2024-01-02 05:45:00 +0545 │ └──────────────────────────────┘ - >>> func(df_pa) # doctest:+SKIP + >>> func(df_pa) pyarrow.Table a: timestamp[us, tz=Asia/Kathmandu] ---- @@ -3898,7 +3896,7 @@ def nth(*indices: int | Sequence[int]) -> Expr: a 0 2 1 4 - >>> func(df_pl) # doctest: +SKIP + >>> func(df_pl) shape: (2, 1) ┌─────┐ │ a │ diff --git a/narwhals/functions.py b/narwhals/functions.py index e1505e78f..b84dcb174 100644 --- a/narwhals/functions.py +++ b/narwhals/functions.py @@ -469,7 +469,7 @@ def show_versions() -> None: Examples: >>> from narwhals import show_versions - >>> show_versions() # doctest:+SKIP + >>> show_versions() # doctest: +SKIP """ sys_info = _get_sys_info() diff --git a/narwhals/schema.py b/narwhals/schema.py index a33acd230..9eb8e3819 100644 --- a/narwhals/schema.py +++ b/narwhals/schema.py @@ -35,7 +35,7 @@ class Schema(BaseSchema): >>> import narwhals as nw >>> schema = nw.Schema({"foo": nw.Int8(), "bar": nw.String()}) - >>> schema # doctest:+SKIP + >>> schema Schema({'foo': Int8, 'bar': String}) Access the data type associated with a specific column name. 
diff --git a/narwhals/series.py b/narwhals/series.py index 0115ac34f..36ecf50ff 100644 --- a/narwhals/series.py +++ b/narwhals/series.py @@ -512,7 +512,7 @@ def mean(self) -> Any: We can then pass either pandas or Polars to `func`: - >>> func(s_pd) # doctest:+SKIP + >>> func(s_pd) np.float64(2.0) >>> func(s_pl) 2.0 @@ -539,7 +539,7 @@ def count(self) -> Any: We can then pass either pandas or Polars to `func`: - >>> func(s_pd) # doctest:+SKIP + >>> func(s_pd) np.int64(3) >>> func(s_pl) 3 @@ -570,7 +570,7 @@ def any(self) -> Any: We can then pass either pandas or Polars to `func`: - >>> func(s_pd) # doctest:+SKIP + >>> func(s_pd) np.True_ >>> func(s_pl) True @@ -597,7 +597,7 @@ def all(self) -> Any: We can then pass either pandas or Polars to `func`: - >>> func(s_pd) # doctest:+SKIP + >>> func(s_pd) np.False_ >>> func(s_pl) False @@ -625,7 +625,7 @@ def min(self) -> Any: We can then pass either pandas or Polars to `func`: - >>> func(s_pd) # doctest:+SKIP + >>> func(s_pd) np.int64(1) >>> func(s_pl) 1 @@ -652,7 +652,7 @@ def max(self) -> Any: We can then pass either pandas or Polars to `func`: - >>> func(s_pd) # doctest:+SKIP + >>> func(s_pd) np.int64(3) >>> func(s_pl) 3 @@ -679,7 +679,7 @@ def sum(self) -> Any: We can then pass either pandas or Polars to `func`: - >>> func(s_pd) # doctest:+SKIP + >>> func(s_pd) np.int64(6) >>> func(s_pl) 6 @@ -710,7 +710,7 @@ def std(self, *, ddof: int = 1) -> Any: We can then pass either pandas or Polars to `func`: - >>> func(s_pd) # doctest:+SKIP + >>> func(s_pd) np.float64(1.0) >>> func(s_pl) 1.0 @@ -1175,13 +1175,13 @@ def sample( We can then pass either pandas or Polars to `func`: - >>> func(s_pd) # doctest:+SKIP + >>> func(s_pd) # doctest: +SKIP a 2 3 1 2 3 4 3 4 - >>> func(s_pl) # doctest:+SKIP + >>> func(s_pl) # doctest: +SKIP shape: (4,) Series: '' [i64] [ @@ -1867,8 +1867,8 @@ def null_count(self: Self) -> int: ... return s.null_count() We can then pass either pandas or Polars to `func`: - >>> func(s_pd) # doctest:+SKIP - 1 + >>> func(s_pd) + np.int64(1) >>> func(s_pl) 2 """ @@ -2082,8 +2082,8 @@ def quantile( We can then pass either pandas or Polars to `func`: - >>> func(s_pd) # doctest: +SKIP - [5, 12, 24, 37, 44] + >>> func(s_pd) + [np.int64(5), np.int64(12), np.int64(24), np.int64(37), np.int64(44)] >>> func(s_pl) # doctest: +NORMALIZE_WHITESPACE [5.0, 12.0, 25.0, 37.0, 44.0] @@ -2166,8 +2166,8 @@ def item(self: Self, index: int | None = None) -> Any: We can then pass either pandas or Polars to `func`: - >>> func(pl.Series("a", [1]), None), func(pd.Series([1]), None) # doctest:+SKIP - (1, 1) + >>> func(pl.Series("a", [1]), None), func(pd.Series([1]), None) + (1, np.int64(1)) >>> func(pl.Series("a", [9, 8, 7]), -1), func(pl.Series([9, 8, 7]), -2) (7, 8) @@ -3175,9 +3175,7 @@ def date(self) -> Series: >>> from datetime import datetime >>> import narwhals as nw >>> dates = [datetime(2012, 1, 7, 10, 20), datetime(2023, 3, 10, 11, 32)] - >>> s_pd = pd.Series(dates).convert_dtypes( - ... dtype_backend="pyarrow" - ... 
) # doctest:+SKIP + >>> s_pd = pd.Series(dates).convert_dtypes(dtype_backend="pyarrow") >>> s_pl = pl.Series(dates) We define a library agnostic function: @@ -3188,7 +3186,7 @@ def date(self) -> Series: We can then pass either pandas or Polars to `func`: - >>> func(s_pd) # doctest:+SKIP + >>> func(s_pd) 0 2012-01-07 1 2023-03-10 dtype: date32[day][pyarrow] @@ -3935,7 +3933,7 @@ def replace_time_zone(self, time_zone: str | None) -> Series: 2024-01-01 00:00:00 +0545 2024-01-02 00:00:00 +0545 ] - >>> func(s_pa) # doctest: +SKIP + >>> func(s_pa) [ [ @@ -3988,7 +3986,7 @@ def convert_time_zone(self, time_zone: str) -> Series: 2024-01-01 05:45:00 +0545 2024-01-02 05:45:00 +0545 ] - >>> func(s_pa) # doctest: +SKIP + >>> func(s_pa) [ [ diff --git a/narwhals/stable/v1/__init__.py b/narwhals/stable/v1/__init__.py index e9aac4cf4..86ddd1def 100644 --- a/narwhals/stable/v1/__init__.py +++ b/narwhals/stable/v1/__init__.py @@ -514,7 +514,7 @@ class Schema(NwSchema): >>> import narwhals.stable.v1 as nw >>> schema = nw.Schema({"foo": nw.Int8(), "bar": nw.String()}) - >>> schema # doctest:+SKIP + >>> schema Schema({'foo': Int8, 'bar': String}) Access the data type associated with a specific column name. @@ -1059,7 +1059,7 @@ def nth(*indices: int | Sequence[int]) -> Expr: a 0 2 1 4 - >>> func(df_pl) # doctest: +SKIP + >>> func(df_pl) shape: (2, 1) ┌─────┐ │ a │ From c19be3b7acb968bf85e5aa26453178544f2df19a Mon Sep 17 00:00:00 2001 From: Liam Connors Date: Thu, 17 Oct 2024 05:59:23 -0400 Subject: [PATCH 122/145] test: update cuDF tests (#1196) * unxfail test_to_dummies_drop_first * xfail timezone tests for cuDF * fix typo --- .../expr_and_series/convert_time_zone_test.py | 20 +++++++++++++------ .../expr_and_series/replace_time_zone_test.py | 2 ++ tests/series_only/to_dummy_test.py | 6 +----- 3 files changed, 17 insertions(+), 11 deletions(-) diff --git a/tests/expr_and_series/convert_time_zone_test.py b/tests/expr_and_series/convert_time_zone_test.py index ee4ccaec4..d85637a14 100644 --- a/tests/expr_and_series/convert_time_zone_test.py +++ b/tests/expr_and_series/convert_time_zone_test.py @@ -17,8 +17,13 @@ def test_convert_time_zone( constructor: Constructor, request: pytest.FixtureRequest ) -> None: - if (any(x in str(constructor) for x in ("pyarrow", "modin")) and is_windows()) or ( - "pandas_pyarrow" in str(constructor) and parse_version(pd.__version__) < (2, 1) + if ( + (any(x in str(constructor) for x in ("pyarrow", "modin")) and is_windows()) + or ( + "pandas_pyarrow" in str(constructor) + and parse_version(pd.__version__) < (2, 1) + ) + or ("cudf" in str(constructor)) ): request.applymarker(pytest.mark.xfail) data = { @@ -41,10 +46,12 @@ def test_convert_time_zone_series( constructor_eager: Any, request: pytest.FixtureRequest ) -> None: if ( - any(x in str(constructor_eager) for x in ("pyarrow", "modin")) and is_windows() - ) or ( - "pandas_pyarrow" in str(constructor_eager) - and parse_version(pd.__version__) < (2, 1) + (any(x in str(constructor_eager) for x in ("pyarrow", "modin")) and is_windows()) + or ( + "pandas_pyarrow" in str(constructor_eager) + and parse_version(pd.__version__) < (2, 1) + ) + or ("cudf" in str(constructor_eager)) ): request.applymarker(pytest.mark.xfail) data = { @@ -73,6 +80,7 @@ def test_convert_time_zone_from_none( and parse_version(pd.__version__) < (2, 1) ) or ("pyarrow_table" in str(constructor) and parse_version(pa.__version__) < (12,)) + or ("cudf" in str(constructor)) ): request.applymarker(pytest.mark.xfail) if "polars" in str(constructor) and 
parse_version(pl.__version__) < (0, 20, 7): diff --git a/tests/expr_and_series/replace_time_zone_test.py b/tests/expr_and_series/replace_time_zone_test.py index 560fcfe84..4954875f0 100644 --- a/tests/expr_and_series/replace_time_zone_test.py +++ b/tests/expr_and_series/replace_time_zone_test.py @@ -20,6 +20,7 @@ def test_replace_time_zone( (any(x in str(constructor) for x in ("pyarrow", "modin")) and is_windows()) or ("pandas_pyarrow" in str(constructor) and parse_version(pd.__version__) < (2,)) or ("pyarrow_table" in str(constructor) and parse_version(pa.__version__) < (12,)) + or ("cudf" in str(constructor)) ): request.applymarker(pytest.mark.xfail) data = { @@ -76,6 +77,7 @@ def test_replace_time_zone_series( "pyarrow_table" in str(constructor_eager) and parse_version(pa.__version__) < (12,) ) + or ("cudf" in str(constructor_eager)) ): request.applymarker(pytest.mark.xfail) data = { diff --git a/tests/series_only/to_dummy_test.py b/tests/series_only/to_dummy_test.py index c3d57b9ad..2cf7f59c7 100644 --- a/tests/series_only/to_dummy_test.py +++ b/tests/series_only/to_dummy_test.py @@ -18,11 +18,7 @@ def test_to_dummies(constructor_eager: Any, sep: str) -> None: @pytest.mark.parametrize("sep", ["_", "-"]) -def test_to_dummies_drop_first( - request: pytest.FixtureRequest, constructor_eager: Any, sep: str -) -> None: - if "cudf" in str(constructor_eager): - request.applymarker(pytest.mark.xfail) +def test_to_dummies_drop_first(constructor_eager: Any, sep: str) -> None: s = nw.from_native(constructor_eager({"a": data}), eager_only=True)["a"].alias("a") result = s.to_dummies(drop_first=True, separator=sep) expected = {f"a{sep}2": [0, 1, 0], f"a{sep}3": [0, 0, 1]} From 2a406cf6a7908513f60df89c192fca1d06690e38 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Dea=20Mar=C3=ADa=20L=C3=A9on?= Date: Thu, 17 Oct 2024 12:03:33 +0200 Subject: [PATCH 123/145] feat: Adding allow inspecting inner fields of nw.Struct (#1192) --- docs/api-reference/dtypes.md | 3 +- narwhals/__init__.py | 2 + narwhals/_arrow/utils.py | 11 ++++- narwhals/_duckdb/dataframe.py | 11 ++++- narwhals/_ibis/dataframe.py | 10 +++- narwhals/_pandas_like/utils.py | 2 +- narwhals/_polars/utils.py | 7 ++- narwhals/dtypes.py | 84 +++++++++++++++++++++++++++++++++- narwhals/stable/v1/__init__.py | 2 + narwhals/stable/v1/_dtypes.py | 2 + narwhals/stable/v1/dtypes.py | 2 + narwhals/stable/v1/typing.py | 1 + narwhals/typing.py | 1 + tests/dtypes_test.py | 37 +++++++++++++++ tests/frame/schema_test.py | 36 +++++++++++---- utils/check_api_reference.py | 9 +++- 16 files changed, 203 insertions(+), 17 deletions(-) diff --git a/docs/api-reference/dtypes.md b/docs/api-reference/dtypes.md index eb96608a6..77bf1266b 100644 --- a/docs/api-reference/dtypes.md +++ b/docs/api-reference/dtypes.md @@ -6,7 +6,6 @@ members: - Array - List - - Struct - Int64 - Int32 - Int16 @@ -15,12 +14,14 @@ - UInt32 - UInt16 - UInt8 + - Field - Float64 - Float32 - Boolean - Categorical - Enum - String + - Struct - Date - Datetime - Duration diff --git a/narwhals/__init__.py b/narwhals/__init__.py index 124f10c45..3a327aad4 100644 --- a/narwhals/__init__.py +++ b/narwhals/__init__.py @@ -10,6 +10,7 @@ from narwhals.dtypes import Datetime from narwhals.dtypes import Duration from narwhals.dtypes import Enum +from narwhals.dtypes import Field from narwhals.dtypes import Float32 from narwhals.dtypes import Float64 from narwhals.dtypes import Int8 @@ -118,6 +119,7 @@ "String", "Datetime", "Duration", + "Field", "Struct", "Array", "List", diff --git a/narwhals/_arrow/utils.py 
b/narwhals/_arrow/utils.py index e37cb093f..7f6fa6558 100644 --- a/narwhals/_arrow/utils.py +++ b/narwhals/_arrow/utils.py @@ -56,7 +56,16 @@ def native_to_narwhals_dtype(dtype: Any, dtypes: DTypes) -> DType: if pa.types.is_dictionary(dtype): return dtypes.Categorical() if pa.types.is_struct(dtype): - return dtypes.Struct() + return dtypes.Struct( + [ + dtypes.Field( + dtype.field(i).name, + native_to_narwhals_dtype(dtype.field(i).type, dtypes), + ) + for i in range(dtype.num_fields) + ] + ) + if pa.types.is_list(dtype) or pa.types.is_large_list(dtype): return dtypes.List(native_to_narwhals_dtype(dtype.value_type, dtypes)) if pa.types.is_fixed_size_list(dtype): diff --git a/narwhals/_duckdb/dataframe.py b/narwhals/_duckdb/dataframe.py index 5877ed51e..82ac6d41b 100644 --- a/narwhals/_duckdb/dataframe.py +++ b/narwhals/_duckdb/dataframe.py @@ -52,7 +52,16 @@ def map_duckdb_dtype_to_narwhals_dtype(duckdb_dtype: Any, dtypes: DTypes) -> DTy if duckdb_dtype == "INTERVAL": return dtypes.Duration() if duckdb_dtype.startswith("STRUCT"): - return dtypes.Struct() + matchstruc_ = re.findall(r"(\w+)\s+(\w+)", duckdb_dtype) + return dtypes.Struct( + [ + dtypes.Field( + matchstruc_[i][0], + map_duckdb_dtype_to_narwhals_dtype(matchstruc_[i][1], dtypes), + ) + for i in range(len(matchstruc_)) + ] + ) if match_ := re.match(r"(.*)\[\]$", duckdb_dtype): return dtypes.List(map_duckdb_dtype_to_narwhals_dtype(match_.group(1), dtypes)) if match_ := re.match(r"(\w+)\[(\d+)\]", duckdb_dtype): diff --git a/narwhals/_ibis/dataframe.py b/narwhals/_ibis/dataframe.py index 9d7ebefb0..a9c3a49fa 100644 --- a/narwhals/_ibis/dataframe.py +++ b/narwhals/_ibis/dataframe.py @@ -51,7 +51,15 @@ def map_ibis_dtype_to_narwhals_dtype(ibis_dtype: Any, dtypes: DTypes) -> DType: map_ibis_dtype_to_narwhals_dtype(ibis_dtype.value_type, dtypes) ) if ibis_dtype.is_struct(): - return dtypes.Struct() + return dtypes.Struct( + [ + dtypes.Field( + ibis_dtype_name, + map_ibis_dtype_to_narwhals_dtype(ibis_dtype_field, dtypes), + ) + for ibis_dtype_name, ibis_dtype_field in ibis_dtype.items() + ] + ) return dtypes.Unknown() # pragma: no cover diff --git a/narwhals/_pandas_like/utils.py b/narwhals/_pandas_like/utils.py index 381a78c8d..0773764d9 100644 --- a/narwhals/_pandas_like/utils.py +++ b/narwhals/_pandas_like/utils.py @@ -294,7 +294,7 @@ def native_to_narwhals_dtype(native_column: Any, dtypes: DTypes) -> DType: native_column.dtype.pyarrow_dtype.list_size, ) if dtype.startswith("struct"): - return dtypes.Struct() + return arrow_native_to_narwhals_dtype(native_column.dtype.pyarrow_dtype, dtypes) if dtype == "object": if ( # pragma: no cover TODO(unassigned): why does this show as uncovered? 
idx := getattr(native_column, "first_valid_index", lambda: None)() diff --git a/narwhals/_polars/utils.py b/narwhals/_polars/utils.py index d44535cc7..fe63f515f 100644 --- a/narwhals/_polars/utils.py +++ b/narwhals/_polars/utils.py @@ -75,7 +75,12 @@ def native_to_narwhals_dtype(dtype: Any, dtypes: DTypes) -> DType: du_time_unit: Literal["us", "ns", "ms"] = getattr(dtype, "time_unit", "us") return dtypes.Duration(time_unit=du_time_unit) if dtype == pl.Struct: - return dtypes.Struct() + return dtypes.Struct( + [ + dtypes.Field(field_name, native_to_narwhals_dtype(field_type, dtypes)) + for field_name, field_type in dtype + ] + ) if dtype == pl.List: return dtypes.List(native_to_narwhals_dtype(dtype.inner, dtypes)) if dtype == pl.Array: diff --git a/narwhals/dtypes.py b/narwhals/dtypes.py index 98d8c6914..73a77af1f 100644 --- a/narwhals/dtypes.py +++ b/narwhals/dtypes.py @@ -1,10 +1,15 @@ from __future__ import annotations +from collections import OrderedDict from datetime import timezone from typing import TYPE_CHECKING -from typing import Literal +from typing import Mapping if TYPE_CHECKING: + from typing import Iterator + from typing import Literal + from typing import Sequence + from typing_extensions import Self @@ -170,7 +175,82 @@ class Categorical(DType): ... class Enum(DType): ... -class Struct(DType): ... +class Field: + """ + Definition of a single field within a `Struct` DataType. + + Arguments: + name: The name of the field within its parent `Struct`. + dtype: The `DataType` of the field's values. + + """ + + name: str + dtype: type[DType] | DType + + def __init__(self, name: str, dtype: type[DType] | DType) -> None: + self.name = name + self.dtype = dtype + + def __eq__(self, other: Field) -> bool: # type: ignore[override] + return (self.name == other.name) & (self.dtype == other.dtype) + + def __hash__(self) -> int: + return hash((self.name, self.dtype)) + + def __repr__(self) -> str: + class_name = self.__class__.__name__ + return f"{class_name}({self.name!r}, {self.dtype})" + + +class Struct(DType): + """ + Struct composite type. + + Arguments: + fields: The fields that make up the struct. Can be either a sequence of Field objects or a mapping of column names to data types. + """ + + fields: list[Field] + + def __init__( + self, fields: Sequence[Field] | Mapping[str, DType | type[DType]] + ) -> None: + if isinstance(fields, Mapping): + self.fields = [Field(name, dtype) for name, dtype in fields.items()] + else: + self.fields = list(fields) + + def __eq__(self, other: DType | type[DType]) -> bool: # type: ignore[override] + # The comparison allows comparing objects to classes, and specific + # inner types to those without (eg: inner=None). if one of the + # arguments is not specific about its inner type we infer it + # as being equal. (See the List type for more info). 
+ if type(other) is type and issubclass(other, self.__class__): + return True + elif isinstance(other, self.__class__): + return self.fields == other.fields + else: + return False + + def __hash__(self) -> int: + return hash((self.__class__, tuple(self.fields))) + + def __iter__(self) -> Iterator[tuple[str, DType | type[DType]]]: + for fld in self.fields: + yield fld.name, fld.dtype + + def __reversed__(self) -> Iterator[tuple[str, DType | type[DType]]]: + for fld in reversed(self.fields): + yield fld.name, fld.dtype + + def __repr__(self) -> str: + class_name = self.__class__.__name__ + return f"{class_name}({dict(self)})" + + def to_schema(self) -> OrderedDict[str, DType | type[DType]]: + """Return Struct dtype as a schema dict.""" + return OrderedDict(self) class List(DType): diff --git a/narwhals/stable/v1/__init__.py b/narwhals/stable/v1/__init__.py index 86ddd1def..93406a145 100644 --- a/narwhals/stable/v1/__init__.py +++ b/narwhals/stable/v1/__init__.py @@ -31,6 +31,7 @@ from narwhals.stable.v1.dtypes import Datetime from narwhals.stable.v1.dtypes import Duration from narwhals.stable.v1.dtypes import Enum +from narwhals.stable.v1.dtypes import Field from narwhals.stable.v1.dtypes import Float32 from narwhals.stable.v1.dtypes import Float64 from narwhals.stable.v1.dtypes import Int8 @@ -2296,6 +2297,7 @@ def from_dict( "String", "Datetime", "Duration", + "Field", "Struct", "Array", "List", diff --git a/narwhals/stable/v1/_dtypes.py b/narwhals/stable/v1/_dtypes.py index 13dd3237d..84c9adc90 100644 --- a/narwhals/stable/v1/_dtypes.py +++ b/narwhals/stable/v1/_dtypes.py @@ -6,6 +6,7 @@ from narwhals.dtypes import DType from narwhals.dtypes import Duration as NwDuration from narwhals.dtypes import Enum +from narwhals.dtypes import Field from narwhals.dtypes import Float32 from narwhals.dtypes import Float64 from narwhals.dtypes import Int8 @@ -77,6 +78,7 @@ def __hash__(self) -> int: "NumericType", "Object", "String", + "Field", "Struct", "UInt8", "UInt16", diff --git a/narwhals/stable/v1/dtypes.py b/narwhals/stable/v1/dtypes.py index f36da9725..21bd1c5ed 100644 --- a/narwhals/stable/v1/dtypes.py +++ b/narwhals/stable/v1/dtypes.py @@ -6,6 +6,7 @@ from narwhals.stable.v1._dtypes import DType from narwhals.stable.v1._dtypes import Duration from narwhals.stable.v1._dtypes import Enum +from narwhals.stable.v1._dtypes import Field from narwhals.stable.v1._dtypes import Float32 from narwhals.stable.v1._dtypes import Float64 from narwhals.stable.v1._dtypes import Int8 @@ -34,6 +35,7 @@ "Enum", "Float32", "Float64", + "Field", "Int8", "Int16", "Int32", diff --git a/narwhals/stable/v1/typing.py b/narwhals/stable/v1/typing.py index e8ab9e1ae..aebe78fc7 100644 --- a/narwhals/stable/v1/typing.py +++ b/narwhals/stable/v1/typing.py @@ -73,6 +73,7 @@ class DTypes: Datetime: type[dtypes.Datetime] Duration: type[dtypes.Duration] Date: type[dtypes.Date] + Field: type[dtypes.Field] Struct: type[dtypes.Struct] List: type[dtypes.List] Array: type[dtypes.Array] diff --git a/narwhals/typing.py b/narwhals/typing.py index 30de0a097..8fcbc697c 100644 --- a/narwhals/typing.py +++ b/narwhals/typing.py @@ -73,6 +73,7 @@ class DTypes: Datetime: type[dtypes.Datetime] Duration: type[dtypes.Duration] Date: type[dtypes.Date] + Field: type[dtypes.Field] Struct: type[dtypes.Struct] List: type[dtypes.List] Array: type[dtypes.Array] diff --git a/tests/dtypes_test.py b/tests/dtypes_test.py index c35507873..b2006f6c1 100644 --- a/tests/dtypes_test.py +++ b/tests/dtypes_test.py @@ -87,6 +87,43 @@ def test_array_valid() -> None: 
dtype = nw.Array(nw.Int64) +def test_struct_valid() -> None: + dtype = nw.Struct([nw.Field("a", nw.Int64)]) + assert dtype == nw.Struct([nw.Field("a", nw.Int64)]) + assert dtype == nw.Struct + assert dtype != nw.Struct([nw.Field("a", nw.Float32)]) + assert dtype != nw.Duration + assert repr(dtype) == "Struct({'a': })" + + dtype = nw.Struct({"a": nw.Int64, "b": nw.String}) + assert dtype == nw.Struct({"a": nw.Int64, "b": nw.String}) + assert dtype.to_schema() == nw.Struct({"a": nw.Int64, "b": nw.String}).to_schema() + assert dtype == nw.Struct + assert dtype != nw.Struct({"a": nw.Int32, "b": nw.String}) + assert dtype in {nw.Struct({"a": nw.Int64, "b": nw.String})} + + +def test_struct_reverse() -> None: + dtype1 = nw.Struct({"a": nw.Int64, "b": nw.String}) + dtype1_reversed = nw.Struct([nw.Field(*field) for field in reversed(dtype1)]) + dtype2 = nw.Struct({"b": nw.String, "a": nw.Int64}) + assert dtype1_reversed == dtype2 + + +def test_field_repr() -> None: + dtype = nw.Field("a", nw.Int32) + assert repr(dtype) == "Field('a', )" + + +def test_struct_hashes() -> None: + dtypes = ( + nw.Struct, + nw.Struct([nw.Field("a", nw.Int64)]), + nw.Struct([nw.Field("a", nw.Int64), nw.Field("b", nw.List(nw.Int64))]), + ) + assert len({hash(tp) for tp in (dtypes)}) == 3 + + @pytest.mark.skipif( parse_version(pl.__version__) < (1,) or parse_version(pd.__version__) < (2, 2), reason="`shape` is only available after 1.0", diff --git a/tests/frame/schema_test.py b/tests/frame/schema_test.py index cb5ddff19..8e1116997 100644 --- a/tests/frame/schema_test.py +++ b/tests/frame/schema_test.py @@ -213,23 +213,39 @@ def test_nested_dtypes() -> None: schema_overrides={"b": pl.Array(pl.Int64, 2)}, ).to_pandas(use_pyarrow_extension_array=True) nwdf = nw.from_native(df) - - assert nwdf.schema == {"a": nw.List, "b": nw.Array, "c": nw.Struct} + assert nwdf.schema == { + "a": nw.List(nw.Int64), + "b": nw.Array(nw.Int64, 2), + "c": nw.Struct({"a": nw.Int64}), + } df = pl.DataFrame( {"a": [[1, 2]], "b": [[1, 2]], "c": [{"a": 1}]}, schema_overrides={"b": pl.Array(pl.Int64, 2)}, ) nwdf = nw.from_native(df) - assert nwdf.schema == {"a": nw.List, "b": nw.Array(nw.Int64, 2), "c": nw.Struct} + assert nwdf.schema == { + "a": nw.List(nw.Int64), + "b": nw.Array(nw.Int64, 2), + "c": nw.Struct({"a": nw.Int64}), + } + df = pl.DataFrame( - {"a": [[1, 2]], "b": [[1, 2]], "c": [{"a": 1}]}, + {"a": [[1, 2]], "b": [[1, 2]], "c": [{"a": 1, "b": "x", "c": 1.1}]}, schema_overrides={"b": pl.Array(pl.Int64, 2)}, ).to_arrow() nwdf = nw.from_native(df) - assert nwdf.schema == {"a": nw.List, "b": nw.Array(nw.Int64, 2), "c": nw.Struct} + assert nwdf.schema == { + "a": nw.List(nw.Int64), + "b": nw.Array(nw.Int64, 2), + "c": nw.Struct({"a": nw.Int64, "b": nw.String, "c": nw.Float64}), + } df = duckdb.sql("select * from df") nwdf = nw.from_native(df) - assert nwdf.schema == {"a": nw.List, "b": nw.Array(nw.Int64, 2), "c": nw.Struct} + assert nwdf.schema == { + "a": nw.List(nw.Int64), + "b": nw.Array(nw.Int64, 2), + "c": nw.Struct({"a": nw.Int64, "b": nw.String, "c": nw.Float64}), + } def test_nested_dtypes_ibis() -> None: # pragma: no cover @@ -240,7 +256,7 @@ def test_nested_dtypes_ibis() -> None: # pragma: no cover ) tbl = ibis.memtable(df[["a", "c"]]) nwdf = nw.from_native(tbl) - assert nwdf.schema == {"a": nw.List, "c": nw.Struct} + assert nwdf.schema == {"a": nw.List(nw.Int64), "c": nw.Struct({"a": nw.Int64})} @pytest.mark.skipif( @@ -259,4 +275,8 @@ def test_nested_dtypes_dask() -> None: ).to_pandas(use_pyarrow_extension_array=True) ) nwdf = 
nw.from_native(df) - assert nwdf.schema == {"a": nw.List, "b": nw.Array, "c": nw.Struct} + assert nwdf.schema == { + "a": nw.List(nw.Int64), + "b": nw.Array(nw.Int64, 2), + "c": nw.Struct({"a": nw.Int64}), + } diff --git a/utils/check_api_reference.py b/utils/check_api_reference.py index 69c310439..e3aa0fb91 100644 --- a/utils/check_api_reference.py +++ b/utils/check_api_reference.py @@ -31,7 +31,14 @@ "zip_with", "__iter__", } -BASE_DTYPES = {"NumericType", "DType", "TemporalType", "Literal"} +BASE_DTYPES = { + "NumericType", + "DType", + "TemporalType", + "Literal", + "OrderedDict", + "Mapping", +} files = {remove_suffix(i, ".py") for i in os.listdir("narwhals")} From 58973a0147b1ff6e58e97704ae668be4fa40034a Mon Sep 17 00:00:00 2001 From: raisadz <34237447+raisadz@users.noreply.github.com> Date: Thu, 17 Oct 2024 11:08:18 +0100 Subject: [PATCH 124/145] docs: Update CONTRIBUTING.md with Python 3.12 (#1197) * Update CONTRIBUTING.md with Python 3.12 * change counting * change counting --- CONTRIBUTING.md | 13 ++++++++----- 1 file changed, 8 insertions(+), 5 deletions(-) diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md index a30273970..c7d7c44a0 100644 --- a/CONTRIBUTING.md +++ b/CONTRIBUTING.md @@ -51,17 +51,20 @@ Here's how you can set up your local development environment to contribute. #### Option 1: Use UV (recommended) -1. Make sure you have Python3.8+ installed (for example, Python 3.11), create a virtual environment, +1. Make sure you have Python3.12 installed, create a virtual environment, and activate it. If you're new to this, here's one way that we recommend: 1. Install uv: https://github.com/astral-sh/uv?tab=readme-ov-file#getting-started - 2. Install some version of Python greater than Python3.8. For example, to install - Python3.11: + or make sure it is up-to-date with: ``` - uv python install 3.11 + uv self update + ``` + 2. Install Python3.12: + ``` + uv python install 3.12 ``` 3. Create a virtual environment: ``` - uv venv -p 3.11 --seed + uv venv -p 3.12 --seed ``` 4. Activate it. On Linux, this is `. .venv/bin/activate`, on Windows `.\.venv\Scripts\activate`. 2. 
Install Narwhals: `uv pip install -e .` From 82b1d6e64471a3016aa68f06ee147f4fc502411e Mon Sep 17 00:00:00 2001 From: Luciano <66913960+lucianosrp@users.noreply.github.com> Date: Thu, 17 Oct 2024 12:56:22 +0200 Subject: [PATCH 125/145] feat: add ConstructorEager type (#1091) --------- Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com> Co-authored-by: Marco Gorelli <33491632+MarcoGorelli@users.noreply.github.com> --- tests/expr_and_series/abs_test.py | 5 ++- tests/expr_and_series/all_horizontal_test.py | 3 +- tests/expr_and_series/any_all_test.py | 5 ++- tests/expr_and_series/arg_true_test.py | 5 ++- tests/expr_and_series/arithmetic_test.py | 7 ++-- .../cat/get_categories_test.py | 7 ++-- tests/expr_and_series/clip_test.py | 5 ++- .../expr_and_series/convert_time_zone_test.py | 11 +++++-- tests/expr_and_series/count_test.py | 5 ++- tests/expr_and_series/cum_sum_test.py | 5 ++- tests/expr_and_series/diff_test.py | 7 ++-- tests/expr_and_series/drop_nulls_test.py | 5 ++- .../dt/datetime_attributes_test.py | 6 ++-- .../dt/datetime_duration_test.py | 4 +-- tests/expr_and_series/dt/to_string_test.py | 7 ++-- tests/expr_and_series/fill_null_test.py | 5 ++- tests/expr_and_series/filter_test.py | 5 ++- tests/expr_and_series/gather_every_test.py | 7 ++-- tests/expr_and_series/head_test.py | 5 ++- tests/expr_and_series/is_between_test.py | 5 ++- tests/expr_and_series/is_duplicated_test.py | 5 ++- .../expr_and_series/is_first_distinct_test.py | 5 ++- tests/expr_and_series/is_in_test.py | 5 ++- .../expr_and_series/is_last_distinct_test.py | 5 ++- tests/expr_and_series/is_null_test.py | 5 ++- tests/expr_and_series/is_unique_test.py | 5 ++- tests/expr_and_series/len_test.py | 5 ++- tests/expr_and_series/max_test.py | 7 ++-- tests/expr_and_series/mean_test.py | 7 ++-- tests/expr_and_series/min_test.py | 7 ++-- tests/expr_and_series/mode_test.py | 5 ++- tests/expr_and_series/n_unique_test.py | 5 ++- tests/expr_and_series/null_count_test.py | 5 ++- tests/expr_and_series/operators_test.py | 9 +++-- tests/expr_and_series/pipe_test.py | 5 ++- tests/expr_and_series/quantile_test.py | 4 +-- .../expr_and_series/replace_time_zone_test.py | 11 +++++-- tests/expr_and_series/round_test.py | 5 ++- tests/expr_and_series/shift_test.py | 5 ++- tests/expr_and_series/sort_test.py | 16 ++++----- tests/expr_and_series/std_test.py | 5 ++- tests/expr_and_series/str/contains_test.py | 7 ++-- tests/expr_and_series/str/head_test.py | 5 ++- tests/expr_and_series/str/len_chars_test.py | 5 ++- tests/expr_and_series/str/replace_test.py | 7 ++-- tests/expr_and_series/str/slice_test.py | 3 +- .../str/starts_with_ends_with_test.py | 7 ++-- tests/expr_and_series/str/strip_chars_test.py | 3 +- tests/expr_and_series/str/tail_test.py | 5 ++- tests/expr_and_series/str/to_datetime_test.py | 12 ++++--- .../str/to_uppercase_to_lowercase_test.py | 7 ++-- tests/expr_and_series/sum_test.py | 7 ++-- tests/expr_and_series/tail_test.py | 5 ++- tests/expr_and_series/unary_test.py | 5 ++- tests/expr_and_series/unique_test.py | 5 ++- tests/expr_and_series/when_test.py | 7 ++-- tests/frame/array_dunder_test.py | 11 ++++--- tests/frame/get_column_test.py | 5 ++- tests/frame/getitem_test.py | 33 +++++++++---------- tests/frame/is_duplicated_test.py | 5 ++- tests/frame/is_empty_test.py | 8 ++++- tests/frame/is_unique_test.py | 5 ++- tests/frame/item_test.py | 11 +++++-- tests/frame/lazy_test.py | 5 ++- tests/frame/len_test.py | 6 ++-- tests/frame/null_count_test.py | 5 ++- tests/frame/row_test.py | 3 +- 
tests/frame/rows_test.py | 6 +++- tests/frame/schema_test.py | 5 ++- tests/frame/shape_test.py | 5 ++- tests/frame/to_arrow_test.py | 9 +++-- tests/frame/to_dict_test.py | 7 ++-- tests/frame/to_native_test.py | 5 ++- tests/frame/to_numpy_test.py | 7 ++-- tests/frame/to_pandas_test.py | 9 +++-- tests/frame/write_csv_test.py | 6 ++-- tests/frame/write_parquet_test.py | 9 +++-- tests/group_by_test.py | 4 +-- tests/new_series_test.py | 7 ++-- tests/series_only/__iter___test.py | 9 +++-- tests/series_only/array_dunder_test.py | 11 ++++--- tests/series_only/dtype_test.py | 7 ++-- tests/series_only/is_empty_test.py | 5 ++- .../is_ordered_categorical_test.py | 5 ++- tests/series_only/is_sorted_test.py | 7 ++-- tests/series_only/item_test.py | 4 +-- tests/series_only/scatter_test.py | 11 ++++--- tests/series_only/shape_test.py | 5 ++- tests/series_only/slice_test.py | 5 ++- tests/series_only/to_arrow_test.py | 9 +++-- tests/series_only/to_dummy_test.py | 7 ++-- tests/series_only/to_frame_test.py | 5 ++- tests/series_only/to_list_test.py | 7 ++-- tests/series_only/to_native_test.py | 9 +++-- tests/series_only/to_numpy_test.py | 9 +++-- tests/series_only/to_pandas_test.py | 9 +++-- tests/series_only/value_counts_test.py | 3 +- tests/series_only/zip_with_test.py | 7 ++-- tests/translate/to_native_test.py | 3 +- tests/utils.py | 4 ++- 100 files changed, 348 insertions(+), 304 deletions(-) diff --git a/tests/expr_and_series/abs_test.py b/tests/expr_and_series/abs_test.py index 286bcca19..c883d7161 100644 --- a/tests/expr_and_series/abs_test.py +++ b/tests/expr_and_series/abs_test.py @@ -1,7 +1,6 @@ -from typing import Any - import narwhals.stable.v1 as nw from tests.utils import Constructor +from tests.utils import ConstructorEager from tests.utils import compare_dicts @@ -12,7 +11,7 @@ def test_abs(constructor: Constructor) -> None: compare_dicts(result, expected) -def test_abs_series(constructor_eager: Any) -> None: +def test_abs_series(constructor_eager: ConstructorEager) -> None: df = nw.from_native(constructor_eager({"a": [1, 2, 3, -4, 5]}), eager_only=True) result = {"b": df["a"].abs()} expected = {"b": [1, 2, 3, 4, 5]} diff --git a/tests/expr_and_series/all_horizontal_test.py b/tests/expr_and_series/all_horizontal_test.py index 01d53fe63..a5ba44600 100644 --- a/tests/expr_and_series/all_horizontal_test.py +++ b/tests/expr_and_series/all_horizontal_test.py @@ -6,6 +6,7 @@ import narwhals.stable.v1 as nw from narwhals.utils import parse_version from tests.utils import Constructor +from tests.utils import ConstructorEager from tests.utils import compare_dicts @@ -23,7 +24,7 @@ def test_allh(constructor: Constructor, expr1: Any, expr2: Any) -> None: compare_dicts(result, expected) -def test_allh_series(constructor_eager: Any) -> None: +def test_allh_series(constructor_eager: ConstructorEager) -> None: data = { "a": [False, False, True], "b": [False, True, True], diff --git a/tests/expr_and_series/any_all_test.py b/tests/expr_and_series/any_all_test.py index 834a91202..73294c708 100644 --- a/tests/expr_and_series/any_all_test.py +++ b/tests/expr_and_series/any_all_test.py @@ -1,7 +1,6 @@ -from typing import Any - import narwhals.stable.v1 as nw from tests.utils import Constructor +from tests.utils import ConstructorEager from tests.utils import compare_dicts @@ -23,7 +22,7 @@ def test_any_all(constructor: Constructor) -> None: compare_dicts(result, expected) -def test_any_all_series(constructor_eager: Any) -> None: +def test_any_all_series(constructor_eager: ConstructorEager) -> None: df = 
nw.from_native( constructor_eager( { diff --git a/tests/expr_and_series/arg_true_test.py b/tests/expr_and_series/arg_true_test.py index 7e1262aa8..1f71e2c42 100644 --- a/tests/expr_and_series/arg_true_test.py +++ b/tests/expr_and_series/arg_true_test.py @@ -1,9 +1,8 @@ -from typing import Any - import pytest import narwhals.stable.v1 as nw from tests.utils import Constructor +from tests.utils import ConstructorEager from tests.utils import compare_dicts @@ -16,7 +15,7 @@ def test_arg_true(constructor: Constructor, request: pytest.FixtureRequest) -> N compare_dicts(result, expected) -def test_arg_true_series(constructor_eager: Any) -> None: +def test_arg_true_series(constructor_eager: ConstructorEager) -> None: df = nw.from_native(constructor_eager({"a": [1, None, None, 3]}), eager_only=True) result = df.select(df["a"].is_null().arg_true()) expected = {"a": [1, 2]} diff --git a/tests/expr_and_series/arithmetic_test.py b/tests/expr_and_series/arithmetic_test.py index e431aebbe..eb283667f 100644 --- a/tests/expr_and_series/arithmetic_test.py +++ b/tests/expr_and_series/arithmetic_test.py @@ -13,6 +13,7 @@ import narwhals.stable.v1 as nw from narwhals.utils import parse_version from tests.utils import Constructor +from tests.utils import ConstructorEager from tests.utils import compare_dicts @@ -94,7 +95,7 @@ def test_arithmetic_series( attr: str, rhs: Any, expected: list[Any], - constructor_eager: Any, + constructor_eager: ConstructorEager, request: pytest.FixtureRequest, ) -> None: if attr == "__mod__" and any( @@ -124,7 +125,7 @@ def test_right_arithmetic_series( attr: str, rhs: Any, expected: list[Any], - constructor_eager: Any, + constructor_eager: ConstructorEager, request: pytest.FixtureRequest, ) -> None: if attr == "__rmod__" and any( @@ -139,7 +140,7 @@ def test_right_arithmetic_series( def test_truediv_same_dims( - constructor_eager: Any, request: pytest.FixtureRequest + constructor_eager: ConstructorEager, request: pytest.FixtureRequest ) -> None: if "polars" in str(constructor_eager): # https://github.com/pola-rs/polars/issues/17760 diff --git a/tests/expr_and_series/cat/get_categories_test.py b/tests/expr_and_series/cat/get_categories_test.py index 122f3c83e..11ba3ee58 100644 --- a/tests/expr_and_series/cat/get_categories_test.py +++ b/tests/expr_and_series/cat/get_categories_test.py @@ -1,18 +1,19 @@ from __future__ import annotations -from typing import Any - import pyarrow as pa import pytest import narwhals.stable.v1 as nw from narwhals.utils import parse_version +from tests.utils import ConstructorEager from tests.utils import compare_dicts data = {"a": ["one", "two", "two"]} -def test_get_categories(request: pytest.FixtureRequest, constructor_eager: Any) -> None: +def test_get_categories( + request: pytest.FixtureRequest, constructor_eager: ConstructorEager +) -> None: if "pyarrow_table" in str(constructor_eager) and parse_version( pa.__version__ ) < parse_version("15.0.0"): diff --git a/tests/expr_and_series/clip_test.py b/tests/expr_and_series/clip_test.py index d3f90633c..2406f289f 100644 --- a/tests/expr_and_series/clip_test.py +++ b/tests/expr_and_series/clip_test.py @@ -1,7 +1,6 @@ -from typing import Any - import narwhals.stable.v1 as nw from tests.utils import Constructor +from tests.utils import ConstructorEager from tests.utils import compare_dicts @@ -20,7 +19,7 @@ def test_clip(constructor: Constructor) -> None: compare_dicts(result, expected) -def test_clip_series(constructor_eager: Any) -> None: +def test_clip_series(constructor_eager: ConstructorEager) -> 
None: df = nw.from_native(constructor_eager({"a": [1, 2, 3, -4, 5]}), eager_only=True) result = { "lower_only": df["a"].clip(lower_bound=3), diff --git a/tests/expr_and_series/convert_time_zone_test.py b/tests/expr_and_series/convert_time_zone_test.py index d85637a14..7914c8b56 100644 --- a/tests/expr_and_series/convert_time_zone_test.py +++ b/tests/expr_and_series/convert_time_zone_test.py @@ -1,6 +1,8 @@ +from __future__ import annotations + from datetime import datetime from datetime import timezone -from typing import Any +from typing import TYPE_CHECKING import pandas as pd import polars as pl @@ -13,6 +15,9 @@ from tests.utils import compare_dicts from tests.utils import is_windows +if TYPE_CHECKING: + from tests.utils import ConstructorEager + def test_convert_time_zone( constructor: Constructor, request: pytest.FixtureRequest @@ -43,7 +48,7 @@ def test_convert_time_zone( def test_convert_time_zone_series( - constructor_eager: Any, request: pytest.FixtureRequest + constructor_eager: ConstructorEager, request: pytest.FixtureRequest ) -> None: if ( (any(x in str(constructor_eager) for x in ("pyarrow", "modin")) and is_windows()) @@ -116,7 +121,7 @@ def test_convert_time_zone_to_none(constructor: Constructor) -> None: df.select(nw.col("a").dt.convert_time_zone(None)) # type: ignore[arg-type] -def test_convert_time_zone_to_none_series(constructor_eager: Any) -> None: +def test_convert_time_zone_to_none_series(constructor_eager: ConstructorEager) -> None: data = { "a": [ datetime(2020, 1, 1, tzinfo=timezone.utc), diff --git a/tests/expr_and_series/count_test.py b/tests/expr_and_series/count_test.py index 580bd202b..ec90e1fc1 100644 --- a/tests/expr_and_series/count_test.py +++ b/tests/expr_and_series/count_test.py @@ -1,7 +1,6 @@ -from typing import Any - import narwhals.stable.v1 as nw from tests.utils import Constructor +from tests.utils import ConstructorEager from tests.utils import compare_dicts @@ -13,7 +12,7 @@ def test_count(constructor: Constructor) -> None: compare_dicts(result, expected) -def test_count_series(constructor_eager: Any) -> None: +def test_count_series(constructor_eager: ConstructorEager) -> None: data = {"a": [1, 3, 2], "b": [4, None, 6], "z": [7.0, None, None]} df = nw.from_native(constructor_eager(data), eager_only=True) result = {"a": [df["a"].count()], "b": [df["b"].count()], "z": [df["z"].count()]} diff --git a/tests/expr_and_series/cum_sum_test.py b/tests/expr_and_series/cum_sum_test.py index 94897a850..a490b890e 100644 --- a/tests/expr_and_series/cum_sum_test.py +++ b/tests/expr_and_series/cum_sum_test.py @@ -1,7 +1,6 @@ -from typing import Any - import narwhals.stable.v1 as nw from tests.utils import Constructor +from tests.utils import ConstructorEager from tests.utils import compare_dicts data = { @@ -22,7 +21,7 @@ def test_cum_sum_simple(constructor: Constructor) -> None: compare_dicts(result, expected) -def test_cum_sum_simple_series(constructor_eager: Any) -> None: +def test_cum_sum_simple_series(constructor_eager: ConstructorEager) -> None: df = nw.from_native(constructor_eager(data), eager_only=True) expected = { "a": [0, 1, 3, 6, 10], diff --git a/tests/expr_and_series/diff_test.py b/tests/expr_and_series/diff_test.py index 33445f763..ada3147ed 100644 --- a/tests/expr_and_series/diff_test.py +++ b/tests/expr_and_series/diff_test.py @@ -1,11 +1,10 @@ -from typing import Any - import pyarrow as pa import pytest import narwhals.stable.v1 as nw from narwhals.utils import parse_version from tests.utils import Constructor +from tests.utils import 
ConstructorEager from tests.utils import compare_dicts data = { @@ -32,7 +31,9 @@ def test_diff(constructor: Constructor, request: pytest.FixtureRequest) -> None: compare_dicts(result, expected) -def test_diff_series(constructor_eager: Any, request: pytest.FixtureRequest) -> None: +def test_diff_series( + constructor_eager: ConstructorEager, request: pytest.FixtureRequest +) -> None: if "pyarrow_table_constructor" in str(constructor_eager) and parse_version( pa.__version__ ) < (13,): diff --git a/tests/expr_and_series/drop_nulls_test.py b/tests/expr_and_series/drop_nulls_test.py index bc06eec3a..4b15416ce 100644 --- a/tests/expr_and_series/drop_nulls_test.py +++ b/tests/expr_and_series/drop_nulls_test.py @@ -1,11 +1,10 @@ from __future__ import annotations -from typing import Any - import pytest import narwhals.stable.v1 as nw from tests.utils import Constructor +from tests.utils import ConstructorEager from tests.utils import compare_dicts @@ -36,7 +35,7 @@ def test_drop_nulls(constructor: Constructor, request: pytest.FixtureRequest) -> compare_dicts(result_d, expected_d) -def test_drop_nulls_series(constructor_eager: Any) -> None: +def test_drop_nulls_series(constructor_eager: ConstructorEager) -> None: data = { "A": [1, 2, None, 4], "B": [5, 6, 7, 8], diff --git a/tests/expr_and_series/dt/datetime_attributes_test.py b/tests/expr_and_series/dt/datetime_attributes_test.py index 5b9519f57..757d226ff 100644 --- a/tests/expr_and_series/dt/datetime_attributes_test.py +++ b/tests/expr_and_series/dt/datetime_attributes_test.py @@ -2,12 +2,12 @@ from datetime import date from datetime import datetime -from typing import Any import pytest import narwhals.stable.v1 as nw from tests.utils import Constructor +from tests.utils import ConstructorEager from tests.utils import compare_dicts data = { @@ -72,7 +72,7 @@ def test_datetime_attributes( ) def test_datetime_attributes_series( request: pytest.FixtureRequest, - constructor_eager: Any, + constructor_eager: ConstructorEager, attribute: str, expected: list[int], ) -> None: @@ -91,7 +91,7 @@ def test_datetime_attributes_series( def test_datetime_chained_attributes( - request: pytest.FixtureRequest, constructor_eager: Any + request: pytest.FixtureRequest, constructor_eager: ConstructorEager ) -> None: if "pandas" in str(constructor_eager) and "pyarrow" not in str(constructor_eager): request.applymarker(pytest.mark.xfail) diff --git a/tests/expr_and_series/dt/datetime_duration_test.py b/tests/expr_and_series/dt/datetime_duration_test.py index da5ff325b..3e4894b0b 100644 --- a/tests/expr_and_series/dt/datetime_duration_test.py +++ b/tests/expr_and_series/dt/datetime_duration_test.py @@ -1,7 +1,6 @@ from __future__ import annotations from datetime import timedelta -from typing import Any import numpy as np import pandas as pd @@ -12,6 +11,7 @@ import narwhals.stable.v1 as nw from narwhals.utils import parse_version from tests.utils import Constructor +from tests.utils import ConstructorEager from tests.utils import compare_dicts data = { @@ -74,7 +74,7 @@ def test_duration_attributes( ) def test_duration_attributes_series( request: pytest.FixtureRequest, - constructor_eager: Any, + constructor_eager: ConstructorEager, attribute: str, expected_a: list[int], expected_b: list[int], diff --git a/tests/expr_and_series/dt/to_string_test.py b/tests/expr_and_series/dt/to_string_test.py index 6017c33d2..a6261b78a 100644 --- a/tests/expr_and_series/dt/to_string_test.py +++ b/tests/expr_and_series/dt/to_string_test.py @@ -7,6 +7,7 @@ import narwhals.stable.v1 as nw 
from tests.utils import Constructor +from tests.utils import ConstructorEager from tests.utils import compare_dicts from tests.utils import is_windows @@ -29,7 +30,7 @@ ], ) @pytest.mark.skipif(is_windows(), reason="pyarrow breaking on windows") -def test_dt_to_string_series(constructor_eager: Any, fmt: str) -> None: +def test_dt_to_string_series(constructor_eager: ConstructorEager, fmt: str) -> None: input_frame = nw.from_native(constructor_eager(data), eager_only=True) input_series = input_frame["a"] @@ -100,7 +101,7 @@ def _clean_string_expr(e: Any) -> Any: ) @pytest.mark.skipif(is_windows(), reason="pyarrow breaking on windows") def test_dt_to_string_iso_local_datetime_series( - constructor_eager: Any, data: datetime, expected: str + constructor_eager: ConstructorEager, data: datetime, expected: str ) -> None: df = constructor_eager({"a": [data]}) result = ( @@ -152,7 +153,7 @@ def test_dt_to_string_iso_local_datetime_expr( ) @pytest.mark.skipif(is_windows(), reason="pyarrow breaking on windows") def test_dt_to_string_iso_local_date_series( - constructor_eager: Any, data: datetime, expected: str + constructor_eager: ConstructorEager, data: datetime, expected: str ) -> None: df = constructor_eager({"a": [data]}) result = nw.from_native(df, eager_only=True)["a"].dt.to_string("%Y-%m-%d").item(0) diff --git a/tests/expr_and_series/fill_null_test.py b/tests/expr_and_series/fill_null_test.py index 6efde5ac0..9fa7afaf9 100644 --- a/tests/expr_and_series/fill_null_test.py +++ b/tests/expr_and_series/fill_null_test.py @@ -1,7 +1,6 @@ -from typing import Any - import narwhals.stable.v1 as nw from tests.utils import Constructor +from tests.utils import ConstructorEager from tests.utils import compare_dicts data = { @@ -23,7 +22,7 @@ def test_fill_null(constructor: Constructor) -> None: compare_dicts(result, expected) -def test_fill_null_series(constructor_eager: Any) -> None: +def test_fill_null_series(constructor_eager: ConstructorEager) -> None: df = nw.from_native(constructor_eager(data), eager_only=True) expected = { diff --git a/tests/expr_and_series/filter_test.py b/tests/expr_and_series/filter_test.py index 80267d1d0..dff987ecb 100644 --- a/tests/expr_and_series/filter_test.py +++ b/tests/expr_and_series/filter_test.py @@ -1,9 +1,8 @@ -from typing import Any - import pytest import narwhals.stable.v1 as nw from tests.utils import Constructor +from tests.utils import ConstructorEager from tests.utils import compare_dicts data = { @@ -23,7 +22,7 @@ def test_filter(constructor: Constructor, request: pytest.FixtureRequest) -> Non compare_dicts(result, expected) -def test_filter_series(constructor_eager: Any) -> None: +def test_filter_series(constructor_eager: ConstructorEager) -> None: df = nw.from_native(constructor_eager(data), eager_only=True) result = df.select(df["a"].filter((df["i"] < 2) & (df["c"] == 5))) expected = {"a": [0]} diff --git a/tests/expr_and_series/gather_every_test.py b/tests/expr_and_series/gather_every_test.py index e01294ef9..e6f68be1d 100644 --- a/tests/expr_and_series/gather_every_test.py +++ b/tests/expr_and_series/gather_every_test.py @@ -1,9 +1,8 @@ -from typing import Any - import pytest import narwhals.stable.v1 as nw from tests.utils import Constructor +from tests.utils import ConstructorEager from tests.utils import compare_dicts data = {"a": list(range(10))} @@ -26,7 +25,9 @@ def test_gather_every_expr( @pytest.mark.parametrize("n", [1, 2, 3]) @pytest.mark.parametrize("offset", [1, 2, 3]) -def test_gather_every_series(constructor_eager: Any, n: int, offset: 
int) -> None: +def test_gather_every_series( + constructor_eager: ConstructorEager, n: int, offset: int +) -> None: series = nw.from_native(constructor_eager(data), eager_only=True)["a"] result = series.gather_every(n=n, offset=offset) diff --git a/tests/expr_and_series/head_test.py b/tests/expr_and_series/head_test.py index 2a6326921..4c750fabf 100644 --- a/tests/expr_and_series/head_test.py +++ b/tests/expr_and_series/head_test.py @@ -1,11 +1,10 @@ from __future__ import annotations -from typing import Any - import pytest import narwhals as nw from tests.utils import Constructor +from tests.utils import ConstructorEager from tests.utils import compare_dicts @@ -22,7 +21,7 @@ def test_head(constructor: Constructor, n: int, request: pytest.FixtureRequest) @pytest.mark.parametrize("n", [2, -1]) -def test_head_series(constructor_eager: Any, n: int) -> None: +def test_head_series(constructor_eager: ConstructorEager, n: int) -> None: df = nw.from_native(constructor_eager({"a": [1, 2, 3]}), eager_only=True) result = df.select(df["a"].head(n)) expected = {"a": [1, 2]} diff --git a/tests/expr_and_series/is_between_test.py b/tests/expr_and_series/is_between_test.py index 0a9e578ea..0550498b6 100644 --- a/tests/expr_and_series/is_between_test.py +++ b/tests/expr_and_series/is_between_test.py @@ -1,11 +1,10 @@ from __future__ import annotations -from typing import Any - import pytest import narwhals.stable.v1 as nw from tests.utils import Constructor +from tests.utils import ConstructorEager from tests.utils import compare_dicts data = { @@ -39,7 +38,7 @@ def test_is_between(constructor: Constructor, closed: str, expected: list[bool]) ], ) def test_is_between_series( - constructor_eager: Any, closed: str, expected: list[bool] + constructor_eager: ConstructorEager, closed: str, expected: list[bool] ) -> None: df = nw.from_native(constructor_eager(data), eager_only=True) result = df.with_columns(a=df["a"].is_between(1, 5, closed=closed)) diff --git a/tests/expr_and_series/is_duplicated_test.py b/tests/expr_and_series/is_duplicated_test.py index 7859aed02..d0c5ae3dc 100644 --- a/tests/expr_and_series/is_duplicated_test.py +++ b/tests/expr_and_series/is_duplicated_test.py @@ -1,7 +1,6 @@ -from typing import Any - import narwhals.stable.v1 as nw from tests.utils import Constructor +from tests.utils import ConstructorEager from tests.utils import compare_dicts data = {"a": [1, 1, 2], "b": [1, 2, 3], "index": [0, 1, 2]} @@ -14,7 +13,7 @@ def test_is_duplicated_expr(constructor: Constructor) -> None: compare_dicts(result, expected) -def test_is_duplicated_series(constructor_eager: Any) -> None: +def test_is_duplicated_series(constructor_eager: ConstructorEager) -> None: series = nw.from_native(constructor_eager(data), eager_only=True)["a"] result = series.is_duplicated() expected = {"a": [True, True, False]} diff --git a/tests/expr_and_series/is_first_distinct_test.py b/tests/expr_and_series/is_first_distinct_test.py index 93ffc5d37..4f22d02f9 100644 --- a/tests/expr_and_series/is_first_distinct_test.py +++ b/tests/expr_and_series/is_first_distinct_test.py @@ -1,7 +1,6 @@ -from typing import Any - import narwhals.stable.v1 as nw from tests.utils import Constructor +from tests.utils import ConstructorEager from tests.utils import compare_dicts data = { @@ -20,7 +19,7 @@ def test_is_first_distinct_expr(constructor: Constructor) -> None: compare_dicts(result, expected) -def test_is_first_distinct_series(constructor_eager: Any) -> None: +def test_is_first_distinct_series(constructor_eager: ConstructorEager) -> 
None: series = nw.from_native(constructor_eager(data), eager_only=True)["a"] result = series.is_first_distinct() expected = { diff --git a/tests/expr_and_series/is_in_test.py b/tests/expr_and_series/is_in_test.py index 085b1efbe..29d3cf56b 100644 --- a/tests/expr_and_series/is_in_test.py +++ b/tests/expr_and_series/is_in_test.py @@ -1,9 +1,8 @@ -from typing import Any - import pytest import narwhals.stable.v1 as nw from tests.utils import Constructor +from tests.utils import ConstructorEager from tests.utils import compare_dicts data = {"a": [1, 4, 2, 5]} @@ -17,7 +16,7 @@ def test_expr_is_in(constructor: Constructor) -> None: compare_dicts(result, expected) -def test_ser_is_in(constructor_eager: Any) -> None: +def test_ser_is_in(constructor_eager: ConstructorEager) -> None: ser = nw.from_native(constructor_eager(data), eager_only=True)["a"] result = {"a": ser.is_in([4, 5])} expected = {"a": [False, True, False, True]} diff --git a/tests/expr_and_series/is_last_distinct_test.py b/tests/expr_and_series/is_last_distinct_test.py index 00db7f735..e63c161b3 100644 --- a/tests/expr_and_series/is_last_distinct_test.py +++ b/tests/expr_and_series/is_last_distinct_test.py @@ -1,7 +1,6 @@ -from typing import Any - import narwhals.stable.v1 as nw from tests.utils import Constructor +from tests.utils import ConstructorEager from tests.utils import compare_dicts data = { @@ -20,7 +19,7 @@ def test_is_last_distinct_expr(constructor: Constructor) -> None: compare_dicts(result, expected) -def test_is_last_distinct_series(constructor_eager: Any) -> None: +def test_is_last_distinct_series(constructor_eager: ConstructorEager) -> None: series = nw.from_native(constructor_eager(data), eager_only=True)["a"] result = series.is_last_distinct() expected = { diff --git a/tests/expr_and_series/is_null_test.py b/tests/expr_and_series/is_null_test.py index 85ba55dc4..a3d5d2bae 100644 --- a/tests/expr_and_series/is_null_test.py +++ b/tests/expr_and_series/is_null_test.py @@ -1,7 +1,6 @@ -from typing import Any - import narwhals.stable.v1 as nw from tests.utils import Constructor +from tests.utils import ConstructorEager from tests.utils import compare_dicts @@ -14,7 +13,7 @@ def test_null(constructor: Constructor) -> None: compare_dicts(result, expected) -def test_null_series(constructor_eager: Any) -> None: +def test_null_series(constructor_eager: ConstructorEager) -> None: data_na = {"a": [None, 3, 2], "z": [7.0, None, None]} expected = {"a": [True, False, False], "z": [True, False, False]} df = nw.from_native(constructor_eager(data_na), eager_only=True) diff --git a/tests/expr_and_series/is_unique_test.py b/tests/expr_and_series/is_unique_test.py index b10f7a68f..8d46db92d 100644 --- a/tests/expr_and_series/is_unique_test.py +++ b/tests/expr_and_series/is_unique_test.py @@ -1,7 +1,6 @@ -from typing import Any - import narwhals.stable.v1 as nw from tests.utils import Constructor +from tests.utils import ConstructorEager from tests.utils import compare_dicts data = { @@ -22,7 +21,7 @@ def test_is_unique_expr(constructor: Constructor) -> None: compare_dicts(result, expected) -def test_is_unique_series(constructor_eager: Any) -> None: +def test_is_unique_series(constructor_eager: ConstructorEager) -> None: series = nw.from_native(constructor_eager(data), eager_only=True)["a"] result = series.is_unique() expected = { diff --git a/tests/expr_and_series/len_test.py b/tests/expr_and_series/len_test.py index b1e1674bf..535c7dc92 100644 --- a/tests/expr_and_series/len_test.py +++ b/tests/expr_and_series/len_test.py @@ -1,9 
+1,8 @@ -from typing import Any - import pytest import narwhals.stable.v1 as nw from tests.utils import Constructor +from tests.utils import ConstructorEager from tests.utils import compare_dicts @@ -46,7 +45,7 @@ def test_namespace_len(constructor: Constructor) -> None: compare_dicts(df, expected) -def test_len_series(constructor_eager: Any) -> None: +def test_len_series(constructor_eager: ConstructorEager) -> None: data = {"a": [1, 2, 1]} s = nw.from_native(constructor_eager(data), eager_only=True)["a"] diff --git a/tests/expr_and_series/max_test.py b/tests/expr_and_series/max_test.py index 1ea32531e..dcacc7d2e 100644 --- a/tests/expr_and_series/max_test.py +++ b/tests/expr_and_series/max_test.py @@ -1,11 +1,10 @@ from __future__ import annotations -from typing import Any - import pytest import narwhals.stable.v1 as nw from tests.utils import Constructor +from tests.utils import ConstructorEager from tests.utils import compare_dicts data = {"a": [1, 3, 2], "b": [4, 4, 6], "z": [7.0, 8, 9]} @@ -20,7 +19,9 @@ def test_expr_max_expr(constructor: Constructor, expr: nw.Expr) -> None: @pytest.mark.parametrize(("col", "expected"), [("a", 3), ("b", 6), ("z", 9.0)]) -def test_expr_max_series(constructor_eager: Any, col: str, expected: float) -> None: +def test_expr_max_series( + constructor_eager: ConstructorEager, col: str, expected: float +) -> None: series = nw.from_native(constructor_eager(data), eager_only=True)[col] result = series.max() compare_dicts({col: [result]}, {col: [expected]}) diff --git a/tests/expr_and_series/mean_test.py b/tests/expr_and_series/mean_test.py index 50e6fd862..0d381286a 100644 --- a/tests/expr_and_series/mean_test.py +++ b/tests/expr_and_series/mean_test.py @@ -1,11 +1,10 @@ from __future__ import annotations -from typing import Any - import pytest import narwhals.stable.v1 as nw from tests.utils import Constructor +from tests.utils import ConstructorEager from tests.utils import compare_dicts data = {"a": [1, 3, 2], "b": [4, 4, 7], "z": [7.0, 8, 9]} @@ -20,7 +19,9 @@ def test_expr_mean_expr(constructor: Constructor, expr: nw.Expr) -> None: @pytest.mark.parametrize(("col", "expected"), [("a", 2.0), ("b", 5.0), ("z", 8.0)]) -def test_expr_mean_series(constructor_eager: Any, col: str, expected: float) -> None: +def test_expr_mean_series( + constructor_eager: ConstructorEager, col: str, expected: float +) -> None: series = nw.from_native(constructor_eager(data), eager_only=True)[col] result = series.mean() compare_dicts({col: [result]}, {col: [expected]}) diff --git a/tests/expr_and_series/min_test.py b/tests/expr_and_series/min_test.py index f6e98e416..afd659df1 100644 --- a/tests/expr_and_series/min_test.py +++ b/tests/expr_and_series/min_test.py @@ -1,11 +1,10 @@ from __future__ import annotations -from typing import Any - import pytest import narwhals.stable.v1 as nw from tests.utils import Constructor +from tests.utils import ConstructorEager from tests.utils import compare_dicts data = {"a": [1, 3, 2], "b": [4, 4, 6], "z": [7.0, 8, 9]} @@ -20,7 +19,9 @@ def test_expr_min_expr(constructor: Constructor, expr: nw.Expr) -> None: @pytest.mark.parametrize(("col", "expected"), [("a", 1), ("b", 4), ("z", 7.0)]) -def test_expr_min_series(constructor_eager: Any, col: str, expected: float) -> None: +def test_expr_min_series( + constructor_eager: ConstructorEager, col: str, expected: float +) -> None: series = nw.from_native(constructor_eager(data), eager_only=True)[col] result = series.min() compare_dicts({col: [result]}, {col: [expected]}) diff --git 
a/tests/expr_and_series/mode_test.py b/tests/expr_and_series/mode_test.py index 8e39405af..820e05ad8 100644 --- a/tests/expr_and_series/mode_test.py +++ b/tests/expr_and_series/mode_test.py @@ -1,11 +1,10 @@ -from typing import Any - import polars as pl import pytest import narwhals.stable.v1 as nw from narwhals.utils import parse_version from tests.utils import Constructor +from tests.utils import ConstructorEager from tests.utils import compare_dicts data = { @@ -39,7 +38,7 @@ def test_mode_multi_expr( compare_dicts(result, expected) -def test_mode_series(constructor_eager: Any) -> None: +def test_mode_series(constructor_eager: ConstructorEager) -> None: series = nw.from_native(constructor_eager(data), eager_only=True)["a"] result = series.mode().sort() expected = {"a": [1, 2]} diff --git a/tests/expr_and_series/n_unique_test.py b/tests/expr_and_series/n_unique_test.py index 3790bb1f3..c4199eec1 100644 --- a/tests/expr_and_series/n_unique_test.py +++ b/tests/expr_and_series/n_unique_test.py @@ -1,7 +1,6 @@ -from typing import Any - import narwhals.stable.v1 as nw from tests.utils import Constructor +from tests.utils import ConstructorEager from tests.utils import compare_dicts data = { @@ -17,7 +16,7 @@ def test_n_unique(constructor: Constructor) -> None: compare_dicts(result, expected) -def test_n_unique_series(constructor_eager: Any) -> None: +def test_n_unique_series(constructor_eager: ConstructorEager) -> None: df = nw.from_native(constructor_eager(data), eager_only=True) expected = {"a": [3], "b": [4]} result_series = {"a": [df["a"].n_unique()], "b": [df["b"].n_unique()]} diff --git a/tests/expr_and_series/null_count_test.py b/tests/expr_and_series/null_count_test.py index 6be15ab32..93d467cb3 100644 --- a/tests/expr_and_series/null_count_test.py +++ b/tests/expr_and_series/null_count_test.py @@ -1,7 +1,6 @@ -from typing import Any - import narwhals.stable.v1 as nw from tests.utils import Constructor +from tests.utils import ConstructorEager from tests.utils import compare_dicts data = { @@ -20,7 +19,7 @@ def test_null_count_expr(constructor: Constructor) -> None: compare_dicts(result, expected) -def test_null_count_series(constructor_eager: Any) -> None: +def test_null_count_series(constructor_eager: ConstructorEager) -> None: data = [1, 2, None] series = nw.from_native(constructor_eager({"a": data}), eager_only=True)["a"] result = series.null_count() diff --git a/tests/expr_and_series/operators_test.py b/tests/expr_and_series/operators_test.py index e3f39465c..b4c3677ef 100644 --- a/tests/expr_and_series/operators_test.py +++ b/tests/expr_and_series/operators_test.py @@ -1,11 +1,10 @@ from __future__ import annotations -from typing import Any - import pytest import narwhals.stable.v1 as nw from tests.utils import Constructor +from tests.utils import ConstructorEager from tests.utils import compare_dicts @@ -78,7 +77,7 @@ def test_logic_operators_expr( ], ) def test_comparand_operators_scalar_series( - constructor_eager: Any, operator: str, expected: list[bool] + constructor_eager: ConstructorEager, operator: str, expected: list[bool] ) -> None: data = {"a": [0, 1, 2]} s = nw.from_native(constructor_eager(data), eager_only=True)["a"] @@ -98,7 +97,7 @@ def test_comparand_operators_scalar_series( ], ) def test_comparand_operators_series( - constructor_eager: Any, operator: str, expected: list[bool] + constructor_eager: ConstructorEager, operator: str, expected: list[bool] ) -> None: data = {"a": [0, 1, 1], "b": [0, 0, 2]} df = nw.from_native(constructor_eager(data), eager_only=True) 
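For reference, the Constructor and ConstructorEager annotations threaded through these hunks are assumed to be plain callable aliases defined in tests/utils.py; the sketch below shows their assumed shape (the exact target types are not visible in this patch and are an assumption):

from typing import Any
from typing import Callable

from narwhals.typing import IntoDataFrame  # assumed import path
from narwhals.typing import IntoFrame  # assumed import path

# A constructor turns a dict of columns into a native frame for one backend;
# the eager variant is limited to backends that materialise data immediately,
# which is what the eager_only=True calls in these tests rely on.
Constructor = Callable[[Any], IntoFrame]
ConstructorEager = Callable[[Any], IntoDataFrame]

Because the aliases are ordinary Callables, replacing Any with them changes nothing at runtime and only tightens what the type checker verifies for the constructor fixtures.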
@@ -115,7 +114,7 @@ def test_comparand_operators_series( ], ) def test_logic_operators_series( - constructor_eager: Any, operator: str, expected: list[bool] + constructor_eager: ConstructorEager, operator: str, expected: list[bool] ) -> None: data = {"a": [True, True, False, False], "b": [True, False, True, False]} df = nw.from_native(constructor_eager(data), eager_only=True) diff --git a/tests/expr_and_series/pipe_test.py b/tests/expr_and_series/pipe_test.py index 2134a931b..84b6006d7 100644 --- a/tests/expr_and_series/pipe_test.py +++ b/tests/expr_and_series/pipe_test.py @@ -1,7 +1,6 @@ -from typing import Any - import narwhals.stable.v1 as nw from tests.utils import Constructor +from tests.utils import ConstructorEager from tests.utils import compare_dicts input_list = {"a": [2, 4, 6, 8]} @@ -15,7 +14,7 @@ def test_pipe_expr(constructor: Constructor) -> None: def test_pipe_series( - constructor_eager: Any, + constructor_eager: ConstructorEager, ) -> None: s = nw.from_native(constructor_eager(input_list), eager_only=True)["a"] result = s.pipe(lambda x: x**2) diff --git a/tests/expr_and_series/quantile_test.py b/tests/expr_and_series/quantile_test.py index aae2b3647..4fd5fa3f4 100644 --- a/tests/expr_and_series/quantile_test.py +++ b/tests/expr_and_series/quantile_test.py @@ -1,13 +1,13 @@ from __future__ import annotations from contextlib import nullcontext as does_not_raise -from typing import Any from typing import Literal import pytest import narwhals.stable.v1 as nw from tests.utils import Constructor +from tests.utils import ConstructorEager from tests.utils import compare_dicts @@ -62,7 +62,7 @@ def test_quantile_expr( ) @pytest.mark.filterwarnings("ignore:the `interpolation=` argument to percentile") def test_quantile_series( - constructor_eager: Any, + constructor_eager: ConstructorEager, interpolation: Literal["nearest", "higher", "lower", "midpoint", "linear"], expected: float, ) -> None: diff --git a/tests/expr_and_series/replace_time_zone_test.py b/tests/expr_and_series/replace_time_zone_test.py index 4954875f0..1c029a478 100644 --- a/tests/expr_and_series/replace_time_zone_test.py +++ b/tests/expr_and_series/replace_time_zone_test.py @@ -1,6 +1,8 @@ +from __future__ import annotations + from datetime import datetime from datetime import timezone -from typing import Any +from typing import TYPE_CHECKING import pandas as pd import pyarrow as pa @@ -12,6 +14,9 @@ from tests.utils import compare_dicts from tests.utils import is_windows +if TYPE_CHECKING: + from tests.utils import ConstructorEager + def test_replace_time_zone( constructor: Constructor, request: pytest.FixtureRequest @@ -65,7 +70,7 @@ def test_replace_time_zone_none( def test_replace_time_zone_series( - constructor_eager: Any, request: pytest.FixtureRequest + constructor_eager: ConstructorEager, request: pytest.FixtureRequest ) -> None: if ( (any(x in str(constructor_eager) for x in ("pyarrow", "modin")) and is_windows()) @@ -97,7 +102,7 @@ def test_replace_time_zone_series( def test_replace_time_zone_none_series( - constructor_eager: Any, request: pytest.FixtureRequest + constructor_eager: ConstructorEager, request: pytest.FixtureRequest ) -> None: if ( (any(x in str(constructor_eager) for x in ("pyarrow", "modin")) and is_windows()) diff --git a/tests/expr_and_series/round_test.py b/tests/expr_and_series/round_test.py index 37d6ce131..613a82afe 100644 --- a/tests/expr_and_series/round_test.py +++ b/tests/expr_and_series/round_test.py @@ -1,11 +1,10 @@ from __future__ import annotations -from typing import Any - 
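The sort_test.py hunk above swaps a native .equals comparison for compare_dicts; a minimal sketch of that backend-agnostic pattern, with a hypothetical test name and data:

import narwhals.stable.v1 as nw
from tests.utils import ConstructorEager
from tests.utils import compare_dicts


def test_sort_sketch(constructor_eager: ConstructorEager) -> None:
    # Build the native frame through the fixture, operate via Narwhals,
    # then assert against a plain dict instead of a backend-specific frame.
    df = nw.from_native(constructor_eager({"a": [3, 1, 2]}), eager_only=True)
    result = df.select(nw.col("a").sort())
    compare_dicts(result, {"a": [1, 2, 3]})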
import pytest import narwhals.stable.v1 as nw from tests.utils import Constructor +from tests.utils import ConstructorEager from tests.utils import compare_dicts @@ -21,7 +20,7 @@ def test_round(constructor: Constructor, decimals: int) -> None: @pytest.mark.parametrize("decimals", [0, 1, 2]) -def test_round_series(constructor_eager: Any, decimals: int) -> None: +def test_round_series(constructor_eager: ConstructorEager, decimals: int) -> None: data = {"a": [1.12345, 2.56789, 3.901234]} df_raw = constructor_eager(data) df = nw.from_native(df_raw, eager_only=True) diff --git a/tests/expr_and_series/shift_test.py b/tests/expr_and_series/shift_test.py index b165adf12..a665ff768 100644 --- a/tests/expr_and_series/shift_test.py +++ b/tests/expr_and_series/shift_test.py @@ -1,9 +1,8 @@ -from typing import Any - import pyarrow as pa import narwhals.stable.v1 as nw from tests.utils import Constructor +from tests.utils import ConstructorEager from tests.utils import compare_dicts data = { @@ -26,7 +25,7 @@ def test_shift(constructor: Constructor) -> None: compare_dicts(result, expected) -def test_shift_series(constructor_eager: Any) -> None: +def test_shift_series(constructor_eager: ConstructorEager) -> None: df = nw.from_native(constructor_eager(data), eager_only=True) result = df.with_columns( df["a"].shift(2), diff --git a/tests/expr_and_series/sort_test.py b/tests/expr_and_series/sort_test.py index f06e21f74..2ea8cd145 100644 --- a/tests/expr_and_series/sort_test.py +++ b/tests/expr_and_series/sort_test.py @@ -3,6 +3,8 @@ import pytest import narwhals.stable.v1 as nw +from tests.utils import ConstructorEager +from tests.utils import compare_dicts data = {"a": [0, 0, 2, -1], "b": [1, 3, 2, None]} @@ -17,16 +19,14 @@ ], ) def test_sort_expr( - constructor_eager: Any, descending: Any, nulls_last: Any, expected: Any + constructor_eager: ConstructorEager, descending: Any, nulls_last: Any, expected: Any ) -> None: df = nw.from_native(constructor_eager(data), eager_only=True) - result = nw.to_native( - df.select( - "a", - nw.col("b").sort(descending=descending, nulls_last=nulls_last), - ) + result = df.select( + "a", + nw.col("b").sort(descending=descending, nulls_last=nulls_last), ) - assert result.equals(constructor_eager(expected)) + compare_dicts(result, expected) @pytest.mark.parametrize( @@ -39,7 +39,7 @@ def test_sort_expr( ], ) def test_sort_series( - constructor_eager: Any, descending: Any, nulls_last: Any, expected: Any + constructor_eager: ConstructorEager, descending: Any, nulls_last: Any, expected: Any ) -> None: series = nw.from_native(constructor_eager(data), eager_only=True)["b"] result = series.sort(descending=descending, nulls_last=nulls_last) diff --git a/tests/expr_and_series/std_test.py b/tests/expr_and_series/std_test.py index 400a6e0af..09779c109 100644 --- a/tests/expr_and_series/std_test.py +++ b/tests/expr_and_series/std_test.py @@ -1,7 +1,6 @@ -from typing import Any - import narwhals.stable.v1 as nw from tests.utils import Constructor +from tests.utils import ConstructorEager from tests.utils import compare_dicts data = {"a": [1, 3, 2], "b": [4, 4, 6], "z": [7.0, 8, 9]} @@ -26,7 +25,7 @@ def test_std(constructor: Constructor) -> None: compare_dicts(result, expected) -def test_std_series(constructor_eager: Any) -> None: +def test_std_series(constructor_eager: ConstructorEager) -> None: df = nw.from_native(constructor_eager(data), eager_only=True) result = { "a_ddof_default": [df["a"].std()], diff --git a/tests/expr_and_series/str/contains_test.py 
b/tests/expr_and_series/str/contains_test.py index 6b9e74b69..139b71eb8 100644 --- a/tests/expr_and_series/str/contains_test.py +++ b/tests/expr_and_series/str/contains_test.py @@ -1,11 +1,10 @@ -from typing import Any - import pandas as pd import polars as pl import pytest import narwhals.stable.v1 as nw from tests.utils import Constructor +from tests.utils import ConstructorEager from tests.utils import compare_dicts data = {"pets": ["cat", "dog", "rabbit and parrot", "dove"]} @@ -32,7 +31,7 @@ def test_contains_case_insensitive( def test_contains_series_case_insensitive( - constructor_eager: Any, request: pytest.FixtureRequest + constructor_eager: ConstructorEager, request: pytest.FixtureRequest ) -> None: if "cudf" in str(constructor_eager): request.applymarker(pytest.mark.xfail) @@ -58,7 +57,7 @@ def test_contains_case_sensitive(constructor: Constructor) -> None: compare_dicts(result, expected) -def test_contains_series_case_sensitive(constructor_eager: Any) -> None: +def test_contains_series_case_sensitive(constructor_eager: ConstructorEager) -> None: df = nw.from_native(constructor_eager(data), eager_only=True) result = df.with_columns(case_sensitive_match=df["pets"].str.contains("parrot|Dove")) expected = { diff --git a/tests/expr_and_series/str/head_test.py b/tests/expr_and_series/str/head_test.py index a4b3e7296..8da64553e 100644 --- a/tests/expr_and_series/str/head_test.py +++ b/tests/expr_and_series/str/head_test.py @@ -1,7 +1,6 @@ -from typing import Any - import narwhals.stable.v1 as nw from tests.utils import Constructor +from tests.utils import ConstructorEager from tests.utils import compare_dicts data = {"a": ["foo", "bars"]} @@ -16,7 +15,7 @@ def test_str_head(constructor: Constructor) -> None: compare_dicts(result, expected) -def test_str_head_series(constructor_eager: Any) -> None: +def test_str_head_series(constructor_eager: ConstructorEager) -> None: df = nw.from_native(constructor_eager(data), eager_only=True) expected = { "a": ["foo", "bar"], diff --git a/tests/expr_and_series/str/len_chars_test.py b/tests/expr_and_series/str/len_chars_test.py index ace145552..80a791c61 100644 --- a/tests/expr_and_series/str/len_chars_test.py +++ b/tests/expr_and_series/str/len_chars_test.py @@ -1,7 +1,6 @@ -from typing import Any - import narwhals.stable.v1 as nw from tests.utils import Constructor +from tests.utils import ConstructorEager from tests.utils import compare_dicts data = {"a": ["foo", "foobar", "Café", "345", "東京"]} @@ -16,7 +15,7 @@ def test_str_len_chars(constructor: Constructor) -> None: compare_dicts(result, expected) -def test_str_len_chars_series(constructor_eager: Any) -> None: +def test_str_len_chars_series(constructor_eager: ConstructorEager) -> None: df = nw.from_native(constructor_eager(data), eager_only=True) expected = { "a": [3, 6, 4, 3, 2], diff --git a/tests/expr_and_series/str/replace_test.py b/tests/expr_and_series/str/replace_test.py index b0cffb1b4..8db24c91e 100644 --- a/tests/expr_and_series/str/replace_test.py +++ b/tests/expr_and_series/str/replace_test.py @@ -1,11 +1,10 @@ from __future__ import annotations -from typing import Any - import pytest import narwhals.stable.v1 as nw from tests.utils import Constructor +from tests.utils import ConstructorEager from tests.utils import compare_dicts replace_data = [ @@ -54,7 +53,7 @@ replace_data, ) def test_str_replace_series( - constructor_eager: Any, + constructor_eager: ConstructorEager, data: dict[str, list[str]], pattern: str, value: str, @@ -75,7 +74,7 @@ def test_str_replace_series( 
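The series-level string tests in these hunks share one shape: build an eager Series through the fixture, call the str method, assert via compare_dicts. A condensed sketch reusing the len_chars data shown above (the test name is hypothetical):

import narwhals.stable.v1 as nw
from tests.utils import ConstructorEager
from tests.utils import compare_dicts


def test_len_chars_sketch(constructor_eager: ConstructorEager) -> None:
    data = {"a": ["foo", "foobar", "Café", "345", "東京"]}
    # eager_only=True guarantees __getitem__ returns a Series, which is why
    # these tests take ConstructorEager rather than Constructor.
    series = nw.from_native(constructor_eager(data), eager_only=True)["a"]
    compare_dicts({"a": series.str.len_chars()}, {"a": [3, 6, 4, 3, 2]})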
replace_all_data, ) def test_str_replace_all_series( - constructor_eager: Any, + constructor_eager: ConstructorEager, data: dict[str, list[str]], pattern: str, value: str, diff --git a/tests/expr_and_series/str/slice_test.py b/tests/expr_and_series/str/slice_test.py index e7fe0efa1..3b7bb90ce 100644 --- a/tests/expr_and_series/str/slice_test.py +++ b/tests/expr_and_series/str/slice_test.py @@ -6,6 +6,7 @@ import narwhals.stable.v1 as nw from tests.utils import Constructor +from tests.utils import ConstructorEager from tests.utils import compare_dicts data = {"a": ["fdas", "edfas"]} @@ -28,7 +29,7 @@ def test_str_slice( [(1, 2, {"a": ["da", "df"]}), (-2, None, {"a": ["as", "as"]})], ) def test_str_slice_series( - constructor_eager: Any, offset: int, length: int | None, expected: Any + constructor_eager: ConstructorEager, offset: int, length: int | None, expected: Any ) -> None: df = nw.from_native(constructor_eager(data), eager_only=True) diff --git a/tests/expr_and_series/str/starts_with_ends_with_test.py b/tests/expr_and_series/str/starts_with_ends_with_test.py index e8b0afaa9..3682c4182 100644 --- a/tests/expr_and_series/str/starts_with_ends_with_test.py +++ b/tests/expr_and_series/str/starts_with_ends_with_test.py @@ -1,9 +1,8 @@ from __future__ import annotations -from typing import Any - import narwhals.stable.v1 as nw from tests.utils import Constructor +from tests.utils import ConstructorEager # Don't move this into typechecking block, for coverage # purposes @@ -21,7 +20,7 @@ def test_ends_with(constructor: Constructor) -> None: compare_dicts(result, expected) -def test_ends_with_series(constructor_eager: Any) -> None: +def test_ends_with_series(constructor_eager: ConstructorEager) -> None: df = nw.from_native(constructor_eager(data), eager_only=True) result = df.select(df["a"].str.ends_with("das")) expected = { @@ -39,7 +38,7 @@ def test_starts_with(constructor: Constructor) -> None: compare_dicts(result, expected) -def test_starts_with_series(constructor_eager: Any) -> None: +def test_starts_with_series(constructor_eager: ConstructorEager) -> None: df = nw.from_native(constructor_eager(data), eager_only=True) result = df.select(df["a"].str.starts_with("fda")) expected = { diff --git a/tests/expr_and_series/str/strip_chars_test.py b/tests/expr_and_series/str/strip_chars_test.py index 3d5b74456..66b9cda0d 100644 --- a/tests/expr_and_series/str/strip_chars_test.py +++ b/tests/expr_and_series/str/strip_chars_test.py @@ -6,6 +6,7 @@ import narwhals.stable.v1 as nw from tests.utils import Constructor +from tests.utils import ConstructorEager from tests.utils import compare_dicts data = {"a": ["foobar", "bar\n", " baz"]} @@ -34,7 +35,7 @@ def test_str_strip_chars( ], ) def test_str_strip_chars_series( - constructor_eager: Any, characters: str | None, expected: Any + constructor_eager: ConstructorEager, characters: str | None, expected: Any ) -> None: df = nw.from_native(constructor_eager(data), eager_only=True) diff --git a/tests/expr_and_series/str/tail_test.py b/tests/expr_and_series/str/tail_test.py index 92d474262..260ab745c 100644 --- a/tests/expr_and_series/str/tail_test.py +++ b/tests/expr_and_series/str/tail_test.py @@ -1,7 +1,6 @@ -from typing import Any - import narwhals.stable.v1 as nw from tests.utils import Constructor +from tests.utils import ConstructorEager from tests.utils import compare_dicts data = {"a": ["foo", "bars"]} @@ -15,7 +14,7 @@ def test_str_tail(constructor: Constructor) -> None: compare_dicts(result_frame, expected) -def 
test_str_tail_series(constructor_eager: Any) -> None: +def test_str_tail_series(constructor_eager: ConstructorEager) -> None: df = nw.from_native(constructor_eager(data), eager_only=True) expected = {"a": ["foo", "ars"]} diff --git a/tests/expr_and_series/str/to_datetime_test.py b/tests/expr_and_series/str/to_datetime_test.py index 8474357e0..62afda474 100644 --- a/tests/expr_and_series/str/to_datetime_test.py +++ b/tests/expr_and_series/str/to_datetime_test.py @@ -1,10 +1,14 @@ -from typing import Any +from __future__ import annotations + +from typing import TYPE_CHECKING import pytest import narwhals.stable.v1 as nw -from tests.utils import Constructor +if TYPE_CHECKING: + from tests.utils import Constructor + from tests.utils import ConstructorEager data = {"a": ["2020-01-01T12:34:56"]} @@ -24,7 +28,7 @@ def test_to_datetime(constructor: Constructor) -> None: assert str(result) == expected -def test_to_datetime_series(constructor_eager: Any) -> None: +def test_to_datetime_series(constructor_eager: ConstructorEager) -> None: if "cudf" in str(constructor_eager): # pragma: no cover expected = "2020-01-01T12:34:56.000000000" else: @@ -60,7 +64,7 @@ def test_to_datetime_infer_fmt( def test_to_datetime_series_infer_fmt( - request: pytest.FixtureRequest, constructor_eager: Any + request: pytest.FixtureRequest, constructor_eager: ConstructorEager ) -> None: if "pyarrow_table" in str(constructor_eager): request.applymarker(pytest.mark.xfail) diff --git a/tests/expr_and_series/str/to_uppercase_to_lowercase_test.py b/tests/expr_and_series/str/to_uppercase_to_lowercase_test.py index 877409138..e5b5832f6 100644 --- a/tests/expr_and_series/str/to_uppercase_to_lowercase_test.py +++ b/tests/expr_and_series/str/to_uppercase_to_lowercase_test.py @@ -1,13 +1,12 @@ from __future__ import annotations -from typing import Any - import pyarrow as pa import pytest import narwhals.stable.v1 as nw from narwhals.utils import parse_version from tests.utils import Constructor +from tests.utils import ConstructorEager from tests.utils import compare_dicts @@ -68,7 +67,7 @@ def test_str_to_uppercase( ], ) def test_str_to_uppercase_series( - constructor_eager: Any, + constructor_eager: ConstructorEager, data: dict[str, list[str]], expected: dict[str, list[str]], request: pytest.FixtureRequest, @@ -134,7 +133,7 @@ def test_str_to_lowercase( ], ) def test_str_to_lowercase_series( - constructor_eager: Any, + constructor_eager: ConstructorEager, data: dict[str, list[str]], expected: dict[str, list[str]], ) -> None: diff --git a/tests/expr_and_series/sum_test.py b/tests/expr_and_series/sum_test.py index 8059a097d..914d902f3 100644 --- a/tests/expr_and_series/sum_test.py +++ b/tests/expr_and_series/sum_test.py @@ -1,11 +1,10 @@ from __future__ import annotations -from typing import Any - import pytest import narwhals.stable.v1 as nw from tests.utils import Constructor +from tests.utils import ConstructorEager from tests.utils import compare_dicts data = {"a": [1, 3, 2], "b": [4, 4, 6], "z": [7.0, 8, 9]} @@ -20,7 +19,9 @@ def test_expr_sum_expr(constructor: Constructor, expr: nw.Expr) -> None: @pytest.mark.parametrize(("col", "expected"), [("a", 6), ("b", 14), ("z", 24.0)]) -def test_expr_sum_series(constructor_eager: Any, col: str, expected: float) -> None: +def test_expr_sum_series( + constructor_eager: ConstructorEager, col: str, expected: float +) -> None: series = nw.from_native(constructor_eager(data), eager_only=True)[col] result = series.sum() compare_dicts({col: [result]}, {col: [expected]}) diff --git 
a/tests/expr_and_series/tail_test.py b/tests/expr_and_series/tail_test.py index fc3e6159a..73acb6848 100644 --- a/tests/expr_and_series/tail_test.py +++ b/tests/expr_and_series/tail_test.py @@ -1,9 +1,8 @@ -from typing import Any - import pytest import narwhals as nw from tests.utils import Constructor +from tests.utils import ConstructorEager from tests.utils import compare_dicts @@ -20,7 +19,7 @@ def test_head(constructor: Constructor, n: int, request: pytest.FixtureRequest) @pytest.mark.parametrize("n", [2, -1]) -def test_head_series(constructor_eager: Any, n: int) -> None: +def test_head_series(constructor_eager: ConstructorEager, n: int) -> None: df = nw.from_native(constructor_eager({"a": [1, 2, 3]}), eager_only=True) result = df.select(df["a"].tail(n)) expected = {"a": [2, 3]} diff --git a/tests/expr_and_series/unary_test.py b/tests/expr_and_series/unary_test.py index 66afd22af..c1e1d007b 100644 --- a/tests/expr_and_series/unary_test.py +++ b/tests/expr_and_series/unary_test.py @@ -1,7 +1,6 @@ -from typing import Any - import narwhals.stable.v1 as nw from tests.utils import Constructor +from tests.utils import ConstructorEager from tests.utils import compare_dicts @@ -29,7 +28,7 @@ def test_unary(constructor: Constructor) -> None: compare_dicts(result, expected) -def test_unary_series(constructor_eager: Any) -> None: +def test_unary_series(constructor_eager: ConstructorEager) -> None: data = {"a": [1, 3, 2], "b": [4, 4, 6], "z": [7.0, 8, 9]} df = nw.from_native(constructor_eager(data), eager_only=True) result = { diff --git a/tests/expr_and_series/unique_test.py b/tests/expr_and_series/unique_test.py index 5639179ba..5048d3250 100644 --- a/tests/expr_and_series/unique_test.py +++ b/tests/expr_and_series/unique_test.py @@ -1,9 +1,8 @@ -from typing import Any - import pytest import narwhals.stable.v1 as nw from tests.utils import Constructor +from tests.utils import ConstructorEager from tests.utils import compare_dicts data = {"a": [1, 1, 2]} @@ -18,7 +17,7 @@ def test_unique_expr(constructor: Constructor, request: pytest.FixtureRequest) - compare_dicts(result, expected) -def test_unique_series(constructor_eager: Any) -> None: +def test_unique_series(constructor_eager: ConstructorEager) -> None: series = nw.from_native(constructor_eager(data), eager_only=True)["a"] result = series.unique() expected = {"a": [1, 2]} diff --git a/tests/expr_and_series/when_test.py b/tests/expr_and_series/when_test.py index 6fabaa68b..eb1ac9c41 100644 --- a/tests/expr_and_series/when_test.py +++ b/tests/expr_and_series/when_test.py @@ -1,12 +1,11 @@ from __future__ import annotations -from typing import Any - import numpy as np import pytest import narwhals.stable.v1 as nw from tests.utils import Constructor +from tests.utils import ConstructorEager from tests.utils import compare_dicts data = { @@ -71,7 +70,7 @@ def test_value_numpy_array( compare_dicts(result, expected) -def test_value_series(constructor_eager: Any) -> None: +def test_value_series(constructor_eager: ConstructorEager) -> None: df = nw.from_native(constructor_eager(data)) s_data = {"s": [3, 4, 5]} s = nw.from_native(constructor_eager(s_data))["s"] @@ -110,7 +109,7 @@ def test_otherwise_numpy_array( compare_dicts(result, expected) -def test_otherwise_series(constructor_eager: Any) -> None: +def test_otherwise_series(constructor_eager: ConstructorEager) -> None: df = nw.from_native(constructor_eager(data)) s_data = {"s": [0, 9, 10]} s = nw.from_native(constructor_eager(s_data))["s"] diff --git a/tests/frame/array_dunder_test.py 
b/tests/frame/array_dunder_test.py index 8a082bb1f..ad3085f56 100644 --- a/tests/frame/array_dunder_test.py +++ b/tests/frame/array_dunder_test.py @@ -1,5 +1,3 @@ -from typing import Any - import numpy as np import pandas as pd import polars as pl @@ -8,10 +6,13 @@ import narwhals.stable.v1 as nw from narwhals.utils import parse_version +from tests.utils import ConstructorEager from tests.utils import compare_dicts -def test_array_dunder(request: pytest.FixtureRequest, constructor_eager: Any) -> None: +def test_array_dunder( + request: pytest.FixtureRequest, constructor_eager: ConstructorEager +) -> None: if "pyarrow_table" in str(constructor_eager) and parse_version( pa.__version__ ) < parse_version("16.0.0"): # pragma: no cover @@ -23,7 +24,7 @@ def test_array_dunder(request: pytest.FixtureRequest, constructor_eager: Any) -> def test_array_dunder_with_dtype( - request: pytest.FixtureRequest, constructor_eager: Any + request: pytest.FixtureRequest, constructor_eager: ConstructorEager ) -> None: if "pyarrow_table" in str(constructor_eager) and parse_version( pa.__version__ @@ -36,7 +37,7 @@ def test_array_dunder_with_dtype( def test_array_dunder_with_copy( - request: pytest.FixtureRequest, constructor_eager: Any + request: pytest.FixtureRequest, constructor_eager: ConstructorEager ) -> None: if "pyarrow_table" in str(constructor_eager) and parse_version(pa.__version__) < ( 16, diff --git a/tests/frame/get_column_test.py b/tests/frame/get_column_test.py index 58766ac31..ff4ebc506 100644 --- a/tests/frame/get_column_test.py +++ b/tests/frame/get_column_test.py @@ -1,13 +1,12 @@ -from typing import Any - import pandas as pd import pytest import narwhals.stable.v1 as nw +from tests.utils import ConstructorEager from tests.utils import compare_dicts -def test_get_column(constructor_eager: Any) -> None: +def test_get_column(constructor_eager: ConstructorEager) -> None: df = nw.from_native(constructor_eager({"a": [1, 2], "b": [3, 4]}), eager_only=True) result = df.get_column("a") compare_dicts({"a": result}, {"a": [1, 2]}) diff --git a/tests/frame/getitem_test.py b/tests/frame/getitem_test.py index ce96c1b24..a5397c7ef 100644 --- a/tests/frame/getitem_test.py +++ b/tests/frame/getitem_test.py @@ -1,7 +1,5 @@ from __future__ import annotations -from typing import Any - import numpy as np import pandas as pd import polars as pl @@ -9,6 +7,7 @@ import pytest import narwhals.stable.v1 as nw +from tests.utils import ConstructorEager from tests.utils import compare_dicts data = { @@ -17,13 +16,13 @@ } -def test_slice_column(constructor_eager: Any) -> None: +def test_slice_column(constructor_eager: ConstructorEager) -> None: result = nw.from_native(constructor_eager(data))["a"] assert isinstance(result, nw.Series) compare_dicts({"a": result}, {"a": [1.0, 2.0, 3.0, 4.0, 5.0, 6.0]}) -def test_slice_rows(constructor_eager: Any) -> None: +def test_slice_rows(constructor_eager: ConstructorEager) -> None: result = nw.from_native(constructor_eager(data))[1:] compare_dicts(result, {"a": [2.0, 3.0, 4.0, 5.0, 6.0], "b": [12, 13, 14, 15, 16]}) @@ -32,7 +31,7 @@ def test_slice_rows(constructor_eager: Any) -> None: def test_slice_rows_with_step( - request: pytest.FixtureRequest, constructor_eager: Any + request: pytest.FixtureRequest, constructor_eager: ConstructorEager ) -> None: if "pyarrow_table" in str(constructor_eager): request.applymarker(pytest.mark.xfail) @@ -53,19 +52,19 @@ def test_slice_lazy_fails() -> None: _ = nw.from_native(pl.LazyFrame(data))[1:] -def test_slice_int(constructor_eager: Any) -> None: 
+def test_slice_int(constructor_eager: ConstructorEager) -> None: result = nw.from_native(constructor_eager(data), eager_only=True)[1] # type: ignore[call-overload] compare_dicts(result, {"a": [2], "b": [12]}) -def test_slice_fails(constructor_eager: Any) -> None: +def test_slice_fails(constructor_eager: ConstructorEager) -> None: class Foo: ... with pytest.raises(TypeError, match="Expected str or slice, got:"): nw.from_native(constructor_eager(data), eager_only=True)[Foo()] # type: ignore[call-overload] -def test_gather(constructor_eager: Any) -> None: +def test_gather(constructor_eager: ConstructorEager) -> None: df = nw.from_native(constructor_eager(data), eager_only=True) result = df[[0, 3, 1]] expected = { @@ -89,7 +88,7 @@ def test_gather_pandas_index() -> None: compare_dicts(result, expected) -def test_gather_rows_cols(constructor_eager: Any) -> None: +def test_gather_rows_cols(constructor_eager: ConstructorEager) -> None: native_df = constructor_eager(data) df = nw.from_native(native_df, eager_only=True) @@ -102,7 +101,7 @@ def test_gather_rows_cols(constructor_eager: Any) -> None: compare_dicts(result, expected) -def test_slice_both_tuples_of_ints(constructor_eager: Any) -> None: +def test_slice_both_tuples_of_ints(constructor_eager: ConstructorEager) -> None: data = {"a": [1, 2, 3], "b": [4, 5, 6], "c": [7, 8, 9]} df = nw.from_native(constructor_eager(data), eager_only=True) result = df[[0, 1], [0, 2]] @@ -110,7 +109,7 @@ def test_slice_both_tuples_of_ints(constructor_eager: Any) -> None: compare_dicts(result, expected) -def test_slice_int_rows_str_columns(constructor_eager: Any) -> None: +def test_slice_int_rows_str_columns(constructor_eager: ConstructorEager) -> None: data = {"a": [1, 2, 3], "b": [4, 5, 6], "c": [7, 8, 9]} df = nw.from_native(constructor_eager(data), eager_only=True) result = df[[0, 1], ["a", "c"]] @@ -118,7 +117,7 @@ def test_slice_int_rows_str_columns(constructor_eager: Any) -> None: compare_dicts(result, expected) -def test_slice_slice_columns(constructor_eager: Any) -> None: # noqa: PLR0915 +def test_slice_slice_columns(constructor_eager: ConstructorEager) -> None: # noqa: PLR0915 data = {"a": [1, 2, 3], "b": [4, 5, 6], "c": [7, 8, 9], "d": [1, 4, 2]} df = nw.from_native(constructor_eager(data), eager_only=True) result = df[[0, 1], "b":"c"] # type: ignore[misc] @@ -188,14 +187,14 @@ def test_slice_slice_columns(constructor_eager: Any) -> None: # noqa: PLR0915 compare_dicts(result, expected) -def test_slice_invalid(constructor_eager: Any) -> None: +def test_slice_invalid(constructor_eager: ConstructorEager) -> None: data = {"a": [1, 2], "b": [4, 5]} df = nw.from_native(constructor_eager(data), eager_only=True) with pytest.raises(TypeError, match="Hint:"): df[0, 0] -def test_slice_edge_cases(constructor_eager: Any) -> None: +def test_slice_edge_cases(constructor_eager: ConstructorEager) -> None: data = {"a": [1, 2, 3], "b": [4, 5, 6], "c": [7, 8, 9], "d": [1, 4, 2]} df = nw.from_native(constructor_eager(data), eager_only=True) assert df[[], :].shape == (0, 4) @@ -219,7 +218,7 @@ def test_slice_edge_cases(constructor_eager: Any) -> None: ], ) def test_get_item_works_with_tuple_and_list_and_range_row_and_col_indexing( - constructor_eager: Any, + constructor_eager: ConstructorEager, row_idx: list[int] | tuple[int] | range, col_idx: list[int] | tuple[int] | range, ) -> None: @@ -236,7 +235,7 @@ def test_get_item_works_with_tuple_and_list_and_range_row_and_col_indexing( ], ) def test_get_item_works_with_tuple_and_list_and_range_row_indexing_and_slice_col_indexing( 
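The getitem hunks above exercise positional row and column indexing; a short usage sketch based on the int-rows/str-columns case shown here (the test name is hypothetical):

import narwhals.stable.v1 as nw
from tests.utils import ConstructorEager
from tests.utils import compare_dicts


def test_rows_and_columns_sketch(constructor_eager: ConstructorEager) -> None:
    data = {"a": [1, 2, 3], "b": [4, 5, 6], "c": [7, 8, 9]}
    df = nw.from_native(constructor_eager(data), eager_only=True)
    # Row positions first, then column labels, as in df[[0, 1], ["a", "c"]] above.
    result = df[[0, 1], ["a", "c"]]
    compare_dicts(result, {"a": [1, 2], "c": [7, 8]})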
- constructor_eager: Any, + constructor_eager: ConstructorEager, row_idx: list[int] | tuple[int] | range, col: slice, ) -> None: @@ -253,7 +252,7 @@ def test_get_item_works_with_tuple_and_list_and_range_row_indexing_and_slice_col ], ) def test_get_item_works_with_tuple_and_list_indexing_and_str( - constructor_eager: Any, + constructor_eager: ConstructorEager, row_idx: list[int] | tuple[int] | range, col: str, ) -> None: diff --git a/tests/frame/is_duplicated_test.py b/tests/frame/is_duplicated_test.py index e1eb3f298..a4dbd97aa 100644 --- a/tests/frame/is_duplicated_test.py +++ b/tests/frame/is_duplicated_test.py @@ -1,12 +1,11 @@ from __future__ import annotations -from typing import Any - import narwhals.stable.v1 as nw +from tests.utils import ConstructorEager from tests.utils import compare_dicts -def test_is_duplicated(constructor_eager: Any) -> None: +def test_is_duplicated(constructor_eager: ConstructorEager) -> None: data = {"a": [1, 3, 2], "b": [4, 4, 6], "z": [7.0, 8, 9]} df_raw = constructor_eager(data) df = nw.from_native(df_raw, eager_only=True) diff --git a/tests/frame/is_empty_test.py b/tests/frame/is_empty_test.py index a772abc8b..7ea6b22ad 100644 --- a/tests/frame/is_empty_test.py +++ b/tests/frame/is_empty_test.py @@ -1,14 +1,20 @@ from __future__ import annotations +from typing import TYPE_CHECKING from typing import Any import pytest import narwhals.stable.v1 as nw +if TYPE_CHECKING: + from tests.utils import ConstructorEager + @pytest.mark.parametrize(("threshold", "expected"), [(0, False), (10, True)]) -def test_is_empty(constructor_eager: Any, threshold: Any, expected: Any) -> None: +def test_is_empty( + constructor_eager: ConstructorEager, threshold: Any, expected: Any +) -> None: data = {"a": [1, 3, 2], "b": [4, 4, 6], "z": [7.0, 8, 9]} df_raw = constructor_eager(data) df = nw.from_native(df_raw, eager_only=True) diff --git a/tests/frame/is_unique_test.py b/tests/frame/is_unique_test.py index 4259c8773..cb9d57ba2 100644 --- a/tests/frame/is_unique_test.py +++ b/tests/frame/is_unique_test.py @@ -1,12 +1,11 @@ from __future__ import annotations -from typing import Any - import narwhals.stable.v1 as nw +from tests.utils import ConstructorEager from tests.utils import compare_dicts -def test_is_unique(constructor_eager: Any) -> None: +def test_is_unique(constructor_eager: ConstructorEager) -> None: data = {"a": [1, 3, 2], "b": [4, 4, 6], "z": [7.0, 8, 9]} df_raw = constructor_eager(data) df = nw.from_native(df_raw, eager_only=True) diff --git a/tests/frame/item_test.py b/tests/frame/item_test.py index 7afbee12d..4453ea611 100644 --- a/tests/frame/item_test.py +++ b/tests/frame/item_test.py @@ -6,6 +6,7 @@ import pytest import narwhals.stable.v1 as nw +from tests.utils import ConstructorEager from tests.utils import compare_dicts @@ -14,7 +15,10 @@ [(0, 2, 7), (1, "z", 8)], ) def test_item( - constructor_eager: Any, row: int | None, column: int | str | None, expected: Any + constructor_eager: ConstructorEager, + row: int | None, + column: int | str | None, + expected: Any, ) -> None: data = {"a": [1, 3, 2], "b": [4, 4, 6], "z": [7.0, 8, 9]} df = nw.from_native(constructor_eager(data), eager_only=True) @@ -43,7 +47,10 @@ def test_item( ], ) def test_item_value_error( - constructor_eager: Any, row: int | None, column: int | str | None, err_msg: str + constructor_eager: ConstructorEager, + row: int | None, + column: int | str | None, + err_msg: str, ) -> None: data = {"a": [1, 3, 2], "b": [4, 4, 6], "z": [7.0, 8, 9]} with pytest.raises(ValueError, match=err_msg): diff 
--git a/tests/frame/lazy_test.py b/tests/frame/lazy_test.py index 09ca734c2..8f1566e69 100644 --- a/tests/frame/lazy_test.py +++ b/tests/frame/lazy_test.py @@ -1,10 +1,9 @@ -from typing import Any - import narwhals as nw import narwhals.stable.v1 as nw_v1 +from tests.utils import ConstructorEager -def test_lazy(constructor_eager: Any) -> None: +def test_lazy(constructor_eager: ConstructorEager) -> None: df = nw.from_native(constructor_eager({"a": [1, 2, 3]}), eager_only=True) result = df.lazy() assert isinstance(result, nw.LazyFrame) diff --git a/tests/frame/len_test.py b/tests/frame/len_test.py index c06884e03..cd082ef2e 100644 --- a/tests/frame/len_test.py +++ b/tests/frame/len_test.py @@ -1,6 +1,5 @@ -from typing import Any - import narwhals.stable.v1 as nw +from tests.utils import ConstructorEager data = { "a": [1.0, 2.0, None, 4.0], @@ -8,6 +7,7 @@ } -def test_len(constructor_eager: Any) -> None: +def test_len(constructor_eager: ConstructorEager) -> None: result = len(nw.from_native(constructor_eager(data), eager_only=True)) + assert result == 4 diff --git a/tests/frame/null_count_test.py b/tests/frame/null_count_test.py index d3bf7f25c..71ac965f8 100644 --- a/tests/frame/null_count_test.py +++ b/tests/frame/null_count_test.py @@ -1,12 +1,11 @@ from __future__ import annotations -from typing import Any - import narwhals.stable.v1 as nw +from tests.utils import ConstructorEager from tests.utils import compare_dicts -def test_null_count(constructor_eager: Any) -> None: +def test_null_count(constructor_eager: ConstructorEager) -> None: data = {"a": [None, 3, 2], "b": [4, 4, 6], "z": [7.0, None, 9]} df_raw = constructor_eager(data) df = nw.from_native(df_raw, eager_only=True) diff --git a/tests/frame/row_test.py b/tests/frame/row_test.py index 599dcaeaf..d977a81f1 100644 --- a/tests/frame/row_test.py +++ b/tests/frame/row_test.py @@ -3,9 +3,10 @@ import pytest import narwhals.stable.v1 as nw +from tests.utils import ConstructorEager -def test_row_column(request: Any, constructor_eager: Any) -> None: +def test_row_column(request: Any, constructor_eager: ConstructorEager) -> None: if "cudf" in str(constructor_eager): request.applymarker(pytest.mark.xfail) diff --git a/tests/frame/rows_test.py b/tests/frame/rows_test.py index 744f66065..60e18658c 100644 --- a/tests/frame/rows_test.py +++ b/tests/frame/rows_test.py @@ -1,5 +1,6 @@ from __future__ import annotations +from typing import TYPE_CHECKING from typing import Any import pandas as pd @@ -10,6 +11,9 @@ import narwhals.stable.v1 as nw from narwhals.utils import parse_version +if TYPE_CHECKING: + from tests.utils import ConstructorEager + df_pandas = pd.DataFrame({"a": [1, 3, 2], "b": [4, 4, 6], "z": [7.0, 8, 9]}) df_pa = pa.table({"a": [1, 3, 2], "b": [4, 4, 6], "z": [7.0, 8, 9]}) if parse_version(pd.__version__) >= parse_version("1.5.0"): @@ -56,7 +60,7 @@ ) def test_iter_rows( request: Any, - constructor_eager: Any, + constructor_eager: ConstructorEager, named: bool, # noqa: FBT001 expected: list[tuple[Any, ...]] | list[dict[str, Any]], ) -> None: diff --git a/tests/frame/schema_test.py b/tests/frame/schema_test.py index 8e1116997..97c3722a7 100644 --- a/tests/frame/schema_test.py +++ b/tests/frame/schema_test.py @@ -12,6 +12,7 @@ import narwhals.stable.v1 as nw from narwhals.utils import parse_version from tests.utils import Constructor +from tests.utils import ConstructorEager data = { "a": [datetime(2020, 1, 1)], @@ -60,7 +61,9 @@ def test_string_disguised_as_object() -> None: assert result["a"] == nw.String -def 
test_actual_object(request: pytest.FixtureRequest, constructor_eager: Any) -> None: +def test_actual_object( + request: pytest.FixtureRequest, constructor_eager: ConstructorEager +) -> None: if any(x in str(constructor_eager) for x in ("modin", "pyarrow_table", "cudf")): request.applymarker(pytest.mark.xfail) diff --git a/tests/frame/shape_test.py b/tests/frame/shape_test.py index 2ab3a23bc..6930214f7 100644 --- a/tests/frame/shape_test.py +++ b/tests/frame/shape_test.py @@ -1,9 +1,8 @@ -from typing import Any - import narwhals.stable.v1 as nw +from tests.utils import ConstructorEager -def test_shape(constructor_eager: Any) -> None: +def test_shape(constructor_eager: ConstructorEager) -> None: result = nw.from_native( constructor_eager({"a": [1, 2], "b": [4, 5], "c": [7, 8]}), eager_only=True ).shape diff --git a/tests/frame/to_arrow_test.py b/tests/frame/to_arrow_test.py index f20bdf28c..373f6310b 100644 --- a/tests/frame/to_arrow_test.py +++ b/tests/frame/to_arrow_test.py @@ -1,6 +1,6 @@ from __future__ import annotations -from typing import Any +from typing import TYPE_CHECKING import pandas as pd import pyarrow as pa @@ -9,8 +9,13 @@ import narwhals.stable.v1 as nw from narwhals.utils import parse_version +if TYPE_CHECKING: + from tests.utils import ConstructorEager -def test_to_arrow(request: pytest.FixtureRequest, constructor_eager: Any) -> None: + +def test_to_arrow( + request: pytest.FixtureRequest, constructor_eager: ConstructorEager +) -> None: if "pandas" in str(constructor_eager) and parse_version(pd.__version__) < (1, 0, 0): # pyarrow requires pandas>=1.0.0 request.applymarker(pytest.mark.xfail) diff --git a/tests/frame/to_dict_test.py b/tests/frame/to_dict_test.py index 29c3d2270..b76003bd1 100644 --- a/tests/frame/to_dict_test.py +++ b/tests/frame/to_dict_test.py @@ -1,22 +1,21 @@ -from typing import Any - import pytest import narwhals.stable.v1 as nw +from tests.utils import ConstructorEager from tests.utils import compare_dicts @pytest.mark.filterwarnings( "ignore:.*all arguments of to_dict except for the argument:FutureWarning" ) -def test_to_dict(constructor_eager: Any) -> None: +def test_to_dict(constructor_eager: ConstructorEager) -> None: data = {"a": [1, 3, 2], "b": [4, 4, 6], "c": [7.0, 8, 9]} df = nw.from_native(constructor_eager(data), eager_only=True) result = df.to_dict(as_series=False) assert result == data -def test_to_dict_as_series(constructor_eager: Any) -> None: +def test_to_dict_as_series(constructor_eager: ConstructorEager) -> None: data = {"a": [1, 3, 2], "b": [4, 4, 6], "c": [7.0, 8, 9]} df = nw.from_native(constructor_eager(data), eager_only=True) result = df.to_dict(as_series=True) diff --git a/tests/frame/to_native_test.py b/tests/frame/to_native_test.py index d8f4132bf..c6de99a17 100644 --- a/tests/frame/to_native_test.py +++ b/tests/frame/to_native_test.py @@ -1,9 +1,8 @@ -from typing import Any - import narwhals.stable.v1 as nw +from tests.utils import Constructor -def test_to_native(constructor: Any) -> None: +def test_to_native(constructor: Constructor) -> None: data = {"a": [1, 3, 2], "b": [4, 4, 6], "z": [7.1, 8, 9]} df_raw = constructor(data) df = nw.from_native(df_raw) diff --git a/tests/frame/to_numpy_test.py b/tests/frame/to_numpy_test.py index d573f4322..aa3dfc2e4 100644 --- a/tests/frame/to_numpy_test.py +++ b/tests/frame/to_numpy_test.py @@ -1,13 +1,16 @@ from __future__ import annotations -from typing import Any +from typing import TYPE_CHECKING import numpy as np import narwhals.stable.v1 as nw +if TYPE_CHECKING: + from tests.utils 
import ConstructorEager -def test_to_numpy(constructor_eager: Any) -> None: + +def test_to_numpy(constructor_eager: ConstructorEager) -> None: data = {"a": [1, 3, 2], "b": [4, 4, 6], "z": [7.1, 8, 9]} df_raw = constructor_eager(data) result = nw.from_native(df_raw, eager_only=True).to_numpy() diff --git a/tests/frame/to_pandas_test.py b/tests/frame/to_pandas_test.py index 671a5d857..d6370f02e 100644 --- a/tests/frame/to_pandas_test.py +++ b/tests/frame/to_pandas_test.py @@ -1,6 +1,6 @@ from __future__ import annotations -from typing import Any +from typing import TYPE_CHECKING import pandas as pd import pytest @@ -8,13 +8,18 @@ import narwhals.stable.v1 as nw from narwhals.utils import parse_version +if TYPE_CHECKING: + from tests.utils import ConstructorEager + @pytest.mark.filterwarnings("ignore:.*Passing a BlockManager.*:DeprecationWarning") @pytest.mark.skipif( parse_version(pd.__version__) < parse_version("2.0.0"), reason="too old for pandas-pyarrow", ) -def test_convert_pandas(constructor_eager: Any, request: pytest.FixtureRequest) -> None: +def test_convert_pandas( + constructor_eager: ConstructorEager, request: pytest.FixtureRequest +) -> None: if "modin" in str(constructor_eager): request.applymarker(pytest.mark.xfail) data = {"a": [1, 3, 2], "b": [4, 4, 6], "z": [7.0, 8, 9]} diff --git a/tests/frame/write_csv_test.py b/tests/frame/write_csv_test.py index ed9303604..84ce84f0d 100644 --- a/tests/frame/write_csv_test.py +++ b/tests/frame/write_csv_test.py @@ -1,16 +1,18 @@ from __future__ import annotations from typing import TYPE_CHECKING -from typing import Any import narwhals.stable.v1 as nw +from tests.utils import ConstructorEager from tests.utils import is_windows if TYPE_CHECKING: import pytest -def test_write_csv(constructor_eager: Any, tmpdir: pytest.TempdirFactory) -> None: +def test_write_csv( + constructor_eager: ConstructorEager, tmpdir: pytest.TempdirFactory +) -> None: data = {"a": [1, 2, 3]} path = tmpdir / "foo.csv" # type: ignore[operator] result = nw.from_native(constructor_eager(data), eager_only=True).write_csv(str(path)) diff --git a/tests/frame/write_parquet_test.py b/tests/frame/write_parquet_test.py index 8efaefb55..c616de198 100644 --- a/tests/frame/write_parquet_test.py +++ b/tests/frame/write_parquet_test.py @@ -1,6 +1,6 @@ from __future__ import annotations -from typing import Any +from typing import TYPE_CHECKING import pandas as pd import pytest @@ -8,13 +8,18 @@ import narwhals.stable.v1 as nw from narwhals.utils import parse_version +if TYPE_CHECKING: + from tests.utils import ConstructorEager + data = {"a": [1, 2, 3]} @pytest.mark.skipif( parse_version(pd.__version__) < parse_version("2.0.0"), reason="too old for pyarrow" ) -def test_write_parquet(constructor_eager: Any, tmpdir: pytest.TempdirFactory) -> None: +def test_write_parquet( + constructor_eager: ConstructorEager, tmpdir: pytest.TempdirFactory +) -> None: path = tmpdir / "foo.parquet" # type: ignore[operator] nw.from_native(constructor_eager(data), eager_only=True).write_parquet(str(path)) assert path.exists() diff --git a/tests/group_by_test.py b/tests/group_by_test.py index 90cb48c26..27407a26a 100644 --- a/tests/group_by_test.py +++ b/tests/group_by_test.py @@ -1,7 +1,6 @@ from __future__ import annotations from contextlib import nullcontext -from typing import Any import pandas as pd import polars as pl @@ -11,6 +10,7 @@ import narwhals.stable.v1 as nw from narwhals.utils import parse_version from tests.utils import Constructor +from tests.utils import ConstructorEager from tests.utils 
import compare_dicts data = {"a": [1, 1, 3], "b": [4, 4, 6], "c": [7.0, 8, 9]} @@ -73,7 +73,7 @@ def test_invalid_group_by() -> None: ) -def test_group_by_iter(constructor_eager: Any) -> None: +def test_group_by_iter(constructor_eager: ConstructorEager) -> None: df = nw.from_native(constructor_eager(data), eager_only=True) expected_keys = [(1,), (3,)] keys = [] diff --git a/tests/new_series_test.py b/tests/new_series_test.py index fad4a7536..37e5d2633 100644 --- a/tests/new_series_test.py +++ b/tests/new_series_test.py @@ -1,14 +1,13 @@ -from typing import Any - import pandas as pd import pytest import narwhals as nw import narwhals.stable.v1 as nw_v1 +from tests.utils import ConstructorEager from tests.utils import compare_dicts -def test_new_series(constructor_eager: Any) -> None: +def test_new_series(constructor_eager: ConstructorEager) -> None: s = nw.from_native(constructor_eager({"a": [1, 2, 3]}), eager_only=True)["a"] result = nw.new_series("b", [4, 1, 2], native_namespace=nw.get_native_namespace(s)) expected = {"b": [4, 1, 2]} @@ -25,7 +24,7 @@ def test_new_series(constructor_eager: Any) -> None: compare_dicts(result.to_frame(), expected) -def test_new_series_v1(constructor_eager: Any) -> None: +def test_new_series_v1(constructor_eager: ConstructorEager) -> None: s = nw_v1.from_native(constructor_eager({"a": [1, 2, 3]}), eager_only=True)["a"] result = nw_v1.new_series( "b", [4, 1, 2], native_namespace=nw_v1.get_native_namespace(s) diff --git a/tests/series_only/__iter___test.py b/tests/series_only/__iter___test.py index a0a5c1189..06753917b 100644 --- a/tests/series_only/__iter___test.py +++ b/tests/series_only/__iter___test.py @@ -1,17 +1,22 @@ from __future__ import annotations from collections.abc import Iterable -from typing import Any +from typing import TYPE_CHECKING import pytest import narwhals.stable.v1 as nw from tests.utils import compare_dicts +if TYPE_CHECKING: + from tests.utils import ConstructorEager + data = [1, 2, 3] -def test_iter(constructor_eager: Any, request: pytest.FixtureRequest) -> None: +def test_iter( + constructor_eager: ConstructorEager, request: pytest.FixtureRequest +) -> None: if "cudf" in str(constructor_eager): request.applymarker(pytest.mark.xfail) s = nw.from_native(constructor_eager({"a": data}), eager_only=True)["a"] diff --git a/tests/series_only/array_dunder_test.py b/tests/series_only/array_dunder_test.py index c09bea9ec..0d95e2db3 100644 --- a/tests/series_only/array_dunder_test.py +++ b/tests/series_only/array_dunder_test.py @@ -1,5 +1,3 @@ -from typing import Any - import numpy as np import pandas as pd import pyarrow as pa @@ -7,10 +5,13 @@ import narwhals.stable.v1 as nw from narwhals.utils import parse_version +from tests.utils import ConstructorEager from tests.utils import compare_dicts -def test_array_dunder(request: pytest.FixtureRequest, constructor_eager: Any) -> None: +def test_array_dunder( + request: pytest.FixtureRequest, constructor_eager: ConstructorEager +) -> None: if "pyarrow_table" in str(constructor_eager) and parse_version( pa.__version__ ) < parse_version("16.0.0"): # pragma: no cover @@ -22,7 +23,7 @@ def test_array_dunder(request: pytest.FixtureRequest, constructor_eager: Any) -> def test_array_dunder_with_dtype( - request: pytest.FixtureRequest, constructor_eager: Any + request: pytest.FixtureRequest, constructor_eager: ConstructorEager ) -> None: if "pyarrow_table" in str(constructor_eager) and parse_version( pa.__version__ @@ -35,7 +36,7 @@ def test_array_dunder_with_dtype( def test_array_dunder_with_copy( - 
request: pytest.FixtureRequest, constructor_eager: Any + request: pytest.FixtureRequest, constructor_eager: ConstructorEager ) -> None: if "pyarrow_table" in str(constructor_eager) and parse_version( pa.__version__ diff --git a/tests/series_only/dtype_test.py b/tests/series_only/dtype_test.py index 68d10fbca..8200150f0 100644 --- a/tests/series_only/dtype_test.py +++ b/tests/series_only/dtype_test.py @@ -1,13 +1,16 @@ from __future__ import annotations -from typing import Any +from typing import TYPE_CHECKING import narwhals.stable.v1 as nw +if TYPE_CHECKING: + from tests.utils import ConstructorEager + data = {"a": [1, 3, 2]} -def test_dtype(constructor_eager: Any) -> None: +def test_dtype(constructor_eager: ConstructorEager) -> None: series = nw.from_native(constructor_eager(data), eager_only=True)["a"] result = series.dtype assert result == nw.Int64 diff --git a/tests/series_only/is_empty_test.py b/tests/series_only/is_empty_test.py index 80b8ab799..390fa7f4f 100644 --- a/tests/series_only/is_empty_test.py +++ b/tests/series_only/is_empty_test.py @@ -1,9 +1,8 @@ -from typing import Any - import narwhals.stable.v1 as nw +from tests.utils import ConstructorEager -def test_is_empty(constructor_eager: Any) -> None: +def test_is_empty(constructor_eager: ConstructorEager) -> None: series = nw.from_native(constructor_eager({"a": [1, 2, 3]}), eager_only=True)["a"] assert not series.is_empty() assert not series[:1].is_empty() diff --git a/tests/series_only/is_ordered_categorical_test.py b/tests/series_only/is_ordered_categorical_test.py index 26358f9a6..10251e362 100644 --- a/tests/series_only/is_ordered_categorical_test.py +++ b/tests/series_only/is_ordered_categorical_test.py @@ -1,5 +1,3 @@ -from typing import Any - import pandas as pd import polars as pl import pyarrow as pa @@ -7,6 +5,7 @@ import narwhals.stable.v1 as nw from narwhals.utils import parse_version +from tests.utils import ConstructorEager def test_is_ordered_categorical() -> None: @@ -39,7 +38,7 @@ def test_is_ordered_categorical_interchange_protocol() -> None: def test_is_definitely_not_ordered_categorical( - constructor_eager: Any, + constructor_eager: ConstructorEager, ) -> None: assert not nw.is_ordered_categorical( nw.from_native(constructor_eager({"a": [1, 2, 3]}), eager_only=True)["a"] diff --git a/tests/series_only/is_sorted_test.py b/tests/series_only/is_sorted_test.py index 3942b5619..23610ee56 100644 --- a/tests/series_only/is_sorted_test.py +++ b/tests/series_only/is_sorted_test.py @@ -1,10 +1,9 @@ from __future__ import annotations -from typing import Any - import pytest import narwhals.stable.v1 as nw +from tests.utils import ConstructorEager from tests.utils import compare_dicts data = [1, 3, 2] @@ -17,7 +16,7 @@ [(data, False, False), (data_sorted, False, True), (data_sorted, True, False)], ) def test_is_sorted( - constructor_eager: Any, + constructor_eager: ConstructorEager, input_data: str, descending: bool, # noqa: FBT001 expected: bool, # noqa: FBT001 @@ -27,7 +26,7 @@ def test_is_sorted( compare_dicts({"a": [result]}, {"a": [expected]}) -def test_is_sorted_invalid(constructor_eager: Any) -> None: +def test_is_sorted_invalid(constructor_eager: ConstructorEager) -> None: series = nw.from_native(constructor_eager({"a": data_sorted}), eager_only=True)["a"] with pytest.raises(TypeError): diff --git a/tests/series_only/item_test.py b/tests/series_only/item_test.py index 869bd7c38..4c199578b 100644 --- a/tests/series_only/item_test.py +++ b/tests/series_only/item_test.py @@ -1,18 +1,18 @@ from __future__ import 
annotations import re -from typing import Any import pytest import narwhals.stable.v1 as nw +from tests.utils import ConstructorEager from tests.utils import compare_dicts data = [1, 3, 2] @pytest.mark.parametrize(("index", "expected"), [(0, 1), (1, 3)]) -def test_item(constructor_eager: Any, index: int, expected: int) -> None: +def test_item(constructor_eager: ConstructorEager, index: int, expected: int) -> None: series = nw.from_native(constructor_eager({"a": data}), eager_only=True)["a"] result = series.item(index) compare_dicts({"a": [result]}, {"a": [expected]}) diff --git a/tests/series_only/scatter_test.py b/tests/series_only/scatter_test.py index 0677a8dd8..9e4bb08af 100644 --- a/tests/series_only/scatter_test.py +++ b/tests/series_only/scatter_test.py @@ -1,14 +1,15 @@ from __future__ import annotations -from typing import Any - import pytest import narwhals as nw +from tests.utils import ConstructorEager from tests.utils import compare_dicts -def test_scatter(constructor_eager: Any, request: pytest.FixtureRequest) -> None: +def test_scatter( + constructor_eager: ConstructorEager, request: pytest.FixtureRequest +) -> None: if "modin" in str(constructor_eager): # https://github.com/modin-project/modin/issues/7392 request.applymarker(pytest.mark.xfail) @@ -26,7 +27,7 @@ def test_scatter(constructor_eager: Any, request: pytest.FixtureRequest) -> None compare_dicts(result, expected) -def test_scatter_unchanged(constructor_eager: Any) -> None: +def test_scatter_unchanged(constructor_eager: ConstructorEager) -> None: df = nw.from_native( constructor_eager({"a": [1, 2, 3], "b": [142, 124, 132]}), eager_only=True ) @@ -40,7 +41,7 @@ def test_scatter_unchanged(constructor_eager: Any) -> None: compare_dicts(df, expected) -def test_single_series(constructor_eager: Any) -> None: +def test_single_series(constructor_eager: ConstructorEager) -> None: df = nw.from_native( constructor_eager({"a": [1, 2, 3], "b": [142, 124, 132]}), eager_only=True ) diff --git a/tests/series_only/shape_test.py b/tests/series_only/shape_test.py index 4a1c0726d..d3e276bb2 100644 --- a/tests/series_only/shape_test.py +++ b/tests/series_only/shape_test.py @@ -1,9 +1,8 @@ -from typing import Any - import narwhals.stable.v1 as nw +from tests.utils import ConstructorEager -def test_shape(constructor_eager: Any) -> None: +def test_shape(constructor_eager: ConstructorEager) -> None: result = nw.from_native(constructor_eager({"a": [1, 2]}), eager_only=True)["a"].shape expected = (2,) assert result == expected diff --git a/tests/series_only/slice_test.py b/tests/series_only/slice_test.py index 9ae194774..eba24fdbd 100644 --- a/tests/series_only/slice_test.py +++ b/tests/series_only/slice_test.py @@ -1,10 +1,9 @@ -from typing import Any - import narwhals.stable.v1 as nw +from tests.utils import ConstructorEager from tests.utils import compare_dicts -def test_slice(constructor_eager: Any) -> None: +def test_slice(constructor_eager: ConstructorEager) -> None: data = {"a": [1, 2, 3], "b": [4, 5, 6], "c": [7, 8, 9], "d": [1, 4, 2]} df = nw.from_native(constructor_eager(data), eager_only=True) result = {"a": df["a"][[0, 1]]} diff --git a/tests/series_only/to_arrow_test.py b/tests/series_only/to_arrow_test.py index 5181a6786..ae6246e55 100644 --- a/tests/series_only/to_arrow_test.py +++ b/tests/series_only/to_arrow_test.py @@ -1,6 +1,6 @@ from __future__ import annotations -from typing import Any +from typing import TYPE_CHECKING import pyarrow as pa import pyarrow.compute as pc @@ -8,8 +8,11 @@ import narwhals.stable.v1 as nw +if 
TYPE_CHECKING: + from tests.utils import ConstructorEager -def test_to_arrow(constructor_eager: Any) -> None: + +def test_to_arrow(constructor_eager: ConstructorEager) -> None: data = [1, 2, 3] result = nw.from_native(constructor_eager({"a": data}), eager_only=True)[ "a" @@ -20,7 +23,7 @@ def test_to_arrow(constructor_eager: Any) -> None: def test_to_arrow_with_nulls( - constructor_eager: Any, request: pytest.FixtureRequest + constructor_eager: ConstructorEager, request: pytest.FixtureRequest ) -> None: if "pandas_constructor" in str(constructor_eager) or "modin_constructor" in str( constructor_eager diff --git a/tests/series_only/to_dummy_test.py b/tests/series_only/to_dummy_test.py index 2cf7f59c7..938b8d04e 100644 --- a/tests/series_only/to_dummy_test.py +++ b/tests/series_only/to_dummy_test.py @@ -1,15 +1,14 @@ -from typing import Any - import pytest import narwhals.stable.v1 as nw +from tests.utils import ConstructorEager from tests.utils import compare_dicts data = [1, 2, 3] @pytest.mark.parametrize("sep", ["_", "-"]) -def test_to_dummies(constructor_eager: Any, sep: str) -> None: +def test_to_dummies(constructor_eager: ConstructorEager, sep: str) -> None: s = nw.from_native(constructor_eager({"a": data}), eager_only=True)["a"].alias("a") result = s.to_dummies(separator=sep) expected = {f"a{sep}1": [1, 0, 0], f"a{sep}2": [0, 1, 0], f"a{sep}3": [0, 0, 1]} @@ -18,7 +17,7 @@ def test_to_dummies(constructor_eager: Any, sep: str) -> None: @pytest.mark.parametrize("sep", ["_", "-"]) -def test_to_dummies_drop_first(constructor_eager: Any, sep: str) -> None: +def test_to_dummies_drop_first(constructor_eager: ConstructorEager, sep: str) -> None: s = nw.from_native(constructor_eager({"a": data}), eager_only=True)["a"].alias("a") result = s.to_dummies(drop_first=True, separator=sep) expected = {f"a{sep}2": [0, 1, 0], f"a{sep}3": [0, 0, 1]} diff --git a/tests/series_only/to_frame_test.py b/tests/series_only/to_frame_test.py index 890036183..065da1414 100644 --- a/tests/series_only/to_frame_test.py +++ b/tests/series_only/to_frame_test.py @@ -1,12 +1,11 @@ -from typing import Any - import narwhals.stable.v1 as nw +from tests.utils import ConstructorEager from tests.utils import compare_dicts data = [1, 2, 3] -def test_to_frame(constructor_eager: Any) -> None: +def test_to_frame(constructor_eager: ConstructorEager) -> None: df = ( nw.from_native(constructor_eager({"a": data}), eager_only=True)["a"] .alias("") diff --git a/tests/series_only/to_list_test.py b/tests/series_only/to_list_test.py index 11d02d0d2..0f91b9879 100644 --- a/tests/series_only/to_list_test.py +++ b/tests/series_only/to_list_test.py @@ -1,14 +1,15 @@ -from typing import Any - import pytest import narwhals.stable.v1 as nw +from tests.utils import ConstructorEager from tests.utils import compare_dicts data = [1, 2, 3] -def test_to_list(constructor_eager: Any, request: pytest.FixtureRequest) -> None: +def test_to_list( + constructor_eager: ConstructorEager, request: pytest.FixtureRequest +) -> None: if "cudf" in str(constructor_eager): # pragma: no cover request.applymarker(pytest.mark.xfail) s = nw.from_native(constructor_eager({"a": data}), eager_only=True)["a"] diff --git a/tests/series_only/to_native_test.py b/tests/series_only/to_native_test.py index 269348ea3..e6955b4c3 100644 --- a/tests/series_only/to_native_test.py +++ b/tests/series_only/to_native_test.py @@ -1,14 +1,17 @@ from __future__ import annotations -from typing import Any +from typing import TYPE_CHECKING import narwhals.stable.v1 as nw +if TYPE_CHECKING: + from 
tests.utils import ConstructorEager + data = [4, 4, 4, 1, 6, 6, 4, 4, 1, 1] -def test_to_native(constructor_eager: Any) -> None: - orig_series = constructor_eager({"a": data})["a"] +def test_to_native(constructor_eager: ConstructorEager) -> None: + orig_series = constructor_eager({"a": data})["a"] # type: ignore[index] nw_series = nw.from_native(constructor_eager({"a": data}), eager_only=True)["a"] result = nw_series.to_native() assert isinstance(result, orig_series.__class__) diff --git a/tests/series_only/to_numpy_test.py b/tests/series_only/to_numpy_test.py index 2f1464a57..966a44449 100644 --- a/tests/series_only/to_numpy_test.py +++ b/tests/series_only/to_numpy_test.py @@ -1,6 +1,6 @@ from __future__ import annotations -from typing import Any +from typing import TYPE_CHECKING import numpy as np import pytest @@ -8,8 +8,13 @@ import narwhals.stable.v1 as nw +if TYPE_CHECKING: + from tests.utils import ConstructorEager -def test_to_numpy(constructor_eager: Any, request: pytest.FixtureRequest) -> None: + +def test_to_numpy( + constructor_eager: ConstructorEager, request: pytest.FixtureRequest +) -> None: if ( "pandas_constructor" in str(constructor_eager) or "modin_constructor" in str(constructor_eager) diff --git a/tests/series_only/to_pandas_test.py b/tests/series_only/to_pandas_test.py index 30c7906c7..46d7df6da 100644 --- a/tests/series_only/to_pandas_test.py +++ b/tests/series_only/to_pandas_test.py @@ -1,6 +1,6 @@ from __future__ import annotations -from typing import Any +from typing import TYPE_CHECKING import pandas as pd import pytest @@ -9,13 +9,18 @@ import narwhals.stable.v1 as nw from narwhals.utils import parse_version +if TYPE_CHECKING: + from tests.utils import ConstructorEager + data = [1, 3, 2] @pytest.mark.skipif( parse_version(pd.__version__) < parse_version("2.0.0"), reason="too old for pyarrow" ) -def test_convert(request: pytest.FixtureRequest, constructor_eager: Any) -> None: +def test_convert( + request: pytest.FixtureRequest, constructor_eager: ConstructorEager +) -> None: if any( cname in str(constructor_eager) for cname in ("pandas_nullable", "pandas_pyarrow", "modin") diff --git a/tests/series_only/value_counts_test.py b/tests/series_only/value_counts_test.py index d19a1440b..342ad7272 100644 --- a/tests/series_only/value_counts_test.py +++ b/tests/series_only/value_counts_test.py @@ -7,6 +7,7 @@ import narwhals.stable.v1 as nw from narwhals.utils import parse_version +from tests.utils import ConstructorEager from tests.utils import compare_dicts data = [4, 4, 4, 1, 6, 6, 4, 4, 1, 1] @@ -16,7 +17,7 @@ @pytest.mark.parametrize("name", [None, "count_name"]) def test_value_counts( request: pytest.FixtureRequest, - constructor_eager: Any, + constructor_eager: ConstructorEager, normalize: Any, name: str | None, ) -> None: diff --git a/tests/series_only/zip_with_test.py b/tests/series_only/zip_with_test.py index 5d1461da3..2de31c060 100644 --- a/tests/series_only/zip_with_test.py +++ b/tests/series_only/zip_with_test.py @@ -1,12 +1,11 @@ from __future__ import annotations -from typing import Any - import narwhals.stable.v1 as nw +from tests.utils import ConstructorEager from tests.utils import compare_dicts -def test_zip_with(constructor_eager: Any) -> None: +def test_zip_with(constructor_eager: ConstructorEager) -> None: series1 = nw.from_native(constructor_eager({"a": [1, 3, 2]}), eager_only=True)["a"] series2 = nw.from_native(constructor_eager({"a": [4, 4, 6]}), eager_only=True)["a"] mask = nw.from_native(constructor_eager({"a": [True, False, True]}), 
eager_only=True)[ @@ -18,7 +17,7 @@ def test_zip_with(constructor_eager: Any) -> None: compare_dicts({"a": result}, {"a": expected}) -def test_zip_with_length_1(constructor_eager: Any) -> None: +def test_zip_with_length_1(constructor_eager: ConstructorEager) -> None: series1 = nw.from_native(constructor_eager({"a": [1]}), eager_only=True)["a"] series2 = nw.from_native(constructor_eager({"a": [4]}), eager_only=True)["a"] mask = nw.from_native(constructor_eager({"a": [False]}), eager_only=True)["a"] diff --git a/tests/translate/to_native_test.py b/tests/translate/to_native_test.py index 03d7704ec..90ec11ab1 100644 --- a/tests/translate/to_native_test.py +++ b/tests/translate/to_native_test.py @@ -4,6 +4,7 @@ import pytest import narwhals.stable.v1 as nw +from tests.utils import ConstructorEager @pytest.mark.parametrize( @@ -20,7 +21,7 @@ ], ) def test_to_native( - constructor_eager: Any, method: str, strict: Any, context: Any + constructor_eager: ConstructorEager, method: str, strict: Any, context: Any ) -> None: df = nw.from_native(constructor_eager({"a": [1, 2, 3]})) diff --git a/tests/utils.py b/tests/utils.py index 15ce25140..302f26f1d 100644 --- a/tests/utils.py +++ b/tests/utils.py @@ -10,6 +10,7 @@ import pandas as pd +from narwhals.typing import IntoDataFrame from narwhals.typing import IntoFrame from narwhals.utils import Implementation @@ -19,6 +20,7 @@ from typing_extensions import TypeAlias # pragma: no cover Constructor: TypeAlias = Callable[[Any], IntoFrame] +ConstructorEager: TypeAlias = Callable[[Any], IntoDataFrame] def zip_strict(left: Sequence[Any], right: Sequence[Any]) -> Iterator[Any]: @@ -52,7 +54,7 @@ def compare_dicts(result: Any, expected: dict[str, Any]) -> None: rhs = rhs.item() # noqa: PLW2901 if isinstance(lhs, float) and not math.isnan(lhs): assert math.isclose(lhs, rhs, rel_tol=0, abs_tol=1e-6), (lhs, rhs) - elif isinstance(lhs, float) and math.isnan(lhs): + elif isinstance(lhs, float) and math.isnan(lhs) and rhs is not None: assert math.isnan(rhs), (lhs, rhs) # pragma: no cover elif pd.isna(lhs): assert pd.isna(rhs), (lhs, rhs) From 9b628ee06f39b7649b071e94c3a663a8c758c46d Mon Sep 17 00:00:00 2001 From: Zhengbo Wang Date: Thu, 17 Oct 2024 18:56:42 +0800 Subject: [PATCH 126/145] feat: Add 'IntoSeries' (#991) --------- Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com> Co-authored-by: Marco Gorelli <33491632+MarcoGorelli@users.noreply.github.com> --- .github/workflows/extremes.yml | 2 +- narwhals/stable/v1/__init__.py | 21 +++++++++++---------- narwhals/stable/v1/typing.py | 9 +++++++++ narwhals/translate.py | 21 +++++++++++---------- narwhals/typing.py | 9 +++++++++ 5 files changed, 41 insertions(+), 21 deletions(-) diff --git a/.github/workflows/extremes.yml b/.github/workflows/extremes.yml index cf488fd2d..3f02f965f 100644 --- a/.github/workflows/extremes.yml +++ b/.github/workflows/extremes.yml @@ -90,7 +90,7 @@ jobs: nightlies: strategy: matrix: - python-version: ["3.11"] + python-version: ["3.12"] os: [ubuntu-latest] if: github.event.pull_request.head.repo.full_name == github.repository runs-on: ${{ matrix.os }} diff --git a/narwhals/stable/v1/__init__.py b/narwhals/stable/v1/__init__.py index 93406a145..c500cdab1 100644 --- a/narwhals/stable/v1/__init__.py +++ b/narwhals/stable/v1/__init__.py @@ -52,6 +52,7 @@ from narwhals.translate import to_native from narwhals.typing import IntoDataFrameT from narwhals.typing import IntoFrameT +from narwhals.typing import IntoSeriesT from narwhals.utils import 
is_ordered_categorical as nw_is_ordered_categorical from narwhals.utils import maybe_align_index as nw_maybe_align_index from narwhals.utils import maybe_convert_dtypes as nw_maybe_convert_dtypes @@ -571,26 +572,26 @@ def _stableify( @overload def from_native( - native_dataframe: Any, + native_dataframe: IntoDataFrameT | IntoSeriesT, *, strict: Literal[False], eager_only: None = ..., eager_or_interchange_only: Literal[True], series_only: None = ..., allow_series: Literal[True], -) -> Any: ... +) -> DataFrame[IntoFrameT] | Series: ... @overload def from_native( - native_dataframe: Any, + native_dataframe: IntoDataFrameT | IntoSeriesT, *, strict: Literal[False], eager_only: Literal[True], eager_or_interchange_only: None = ..., series_only: None = ..., allow_series: Literal[True], -) -> Any: ... +) -> DataFrame[IntoDataFrameT] | Series: ... @overload @@ -643,26 +644,26 @@ def from_native( @overload def from_native( - native_dataframe: Any, + native_dataframe: IntoFrameT | IntoSeriesT, *, strict: Literal[False], eager_only: None = ..., eager_or_interchange_only: None = ..., series_only: None = ..., allow_series: Literal[True], -) -> Any: ... +) -> DataFrame[IntoFrameT] | LazyFrame[IntoFrameT] | Series: ... @overload def from_native( - native_dataframe: Any, + native_dataframe: IntoSeriesT, *, strict: Literal[False], eager_only: None = ..., eager_or_interchange_only: None = ..., series_only: Literal[True], allow_series: None = ..., -) -> Any: ... +) -> Series: ... @overload @@ -723,7 +724,7 @@ def from_native( @overload def from_native( - native_dataframe: Any, + native_dataframe: IntoFrameT | IntoSeriesT, *, strict: Literal[True] = ..., eager_only: None = ..., @@ -739,7 +740,7 @@ def from_native( @overload def from_native( - native_dataframe: Any, + native_dataframe: IntoSeriesT | Any, # remain `Any` for downstream compatibility *, strict: Literal[True] = ..., eager_only: None = ..., diff --git a/narwhals/stable/v1/typing.py b/narwhals/stable/v1/typing.py index aebe78fc7..79adf5063 100644 --- a/narwhals/stable/v1/typing.py +++ b/narwhals/stable/v1/typing.py @@ -29,6 +29,9 @@ def columns(self) -> Any: ... def join(self, *args: Any, **kwargs: Any) -> Any: ... + class NativeSeries(Protocol): + def __len__(self) -> int: ... + class DataFrameLike(Protocol): def __dataframe__(self, *args: Any, **kwargs: Any) -> Any: ... @@ -47,11 +50,15 @@ def __dataframe__(self, *args: Any, **kwargs: Any) -> Any: ... 
Frame: TypeAlias = Union["DataFrame[Any]", "LazyFrame[Any]"] """Narwhals DataFrame or Narwhals LazyFrame""" +IntoSeries: TypeAlias = Union["Series", "NativeSeries"] +"""Anything which can be converted to a Narwhals Series.""" + # TypeVars for some of the above IntoFrameT = TypeVar("IntoFrameT", bound="IntoFrame") IntoDataFrameT = TypeVar("IntoDataFrameT", bound="IntoDataFrame") FrameT = TypeVar("FrameT", bound="Frame") DataFrameT = TypeVar("DataFrameT", bound="DataFrame[Any]") +IntoSeriesT = TypeVar("IntoSeriesT", bound="IntoSeries") class DTypes: @@ -89,4 +96,6 @@ class DTypes: "Frame", "FrameT", "DataFrameT", + "IntoSeries", + "IntoSeriesT", ] diff --git a/narwhals/translate.py b/narwhals/translate.py index 4c23f6d91..0dc0cd467 100644 --- a/narwhals/translate.py +++ b/narwhals/translate.py @@ -37,6 +37,7 @@ from narwhals.typing import DTypes from narwhals.typing import IntoDataFrameT from narwhals.typing import IntoFrameT + from narwhals.typing import IntoSeriesT T = TypeVar("T") @@ -86,26 +87,26 @@ def to_native( @overload def from_native( - native_object: Any, + native_object: IntoDataFrameT | IntoSeriesT, *, strict: Literal[False], eager_only: None = ..., eager_or_interchange_only: Literal[True], series_only: None = ..., allow_series: Literal[True], -) -> Any: ... +) -> DataFrame[IntoDataFrameT]: ... @overload def from_native( - native_object: Any, + native_object: IntoDataFrameT | IntoSeriesT, *, strict: Literal[False], eager_only: Literal[True], eager_or_interchange_only: None = ..., series_only: None = ..., allow_series: Literal[True], -) -> Any: ... +) -> DataFrame[IntoDataFrameT] | Series: ... @overload @@ -158,26 +159,26 @@ def from_native( @overload def from_native( - native_object: Any, + native_object: IntoFrameT | IntoSeriesT, *, strict: Literal[False], eager_only: None = ..., eager_or_interchange_only: None = ..., series_only: None = ..., allow_series: Literal[True], -) -> Any: ... +) -> DataFrame[IntoFrameT] | LazyFrame[IntoFrameT] | Series: ... @overload def from_native( - native_object: Any, + native_object: IntoSeriesT, *, strict: Literal[False], eager_only: None = ..., eager_or_interchange_only: None = ..., series_only: Literal[True], allow_series: None = ..., -) -> Any: ... +) -> Series: ... @overload @@ -238,7 +239,7 @@ def from_native( @overload def from_native( - native_object: Any, + native_object: IntoFrameT | IntoSeriesT, *, strict: Literal[True] = ..., eager_only: None = ..., @@ -254,7 +255,7 @@ def from_native( @overload def from_native( - native_object: Any, + native_object: IntoSeriesT, *, strict: Literal[True] = ..., eager_only: None = ..., diff --git a/narwhals/typing.py b/narwhals/typing.py index 8fcbc697c..044962ac3 100644 --- a/narwhals/typing.py +++ b/narwhals/typing.py @@ -29,6 +29,9 @@ def columns(self) -> Any: ... def join(self, *args: Any, **kwargs: Any) -> Any: ... + class NativeSeries(Protocol): + def __len__(self) -> int: ... + class DataFrameLike(Protocol): def __dataframe__(self, *args: Any, **kwargs: Any) -> Any: ... @@ -47,11 +50,15 @@ def __dataframe__(self, *args: Any, **kwargs: Any) -> Any: ... 
Frame: TypeAlias = Union["DataFrame[Any]", "LazyFrame[Any]"] """Narwhals DataFrame or Narwhals LazyFrame""" +IntoSeries: TypeAlias = Union["Series", "NativeSeries"] +"""Anything which can be converted to a Narwhals Series.""" + # TypeVars for some of the above IntoFrameT = TypeVar("IntoFrameT", bound="IntoFrame") IntoDataFrameT = TypeVar("IntoDataFrameT", bound="IntoDataFrame") FrameT = TypeVar("FrameT", bound="Frame") DataFrameT = TypeVar("DataFrameT", bound="DataFrame[Any]") +IntoSeriesT = TypeVar("IntoSeriesT", bound="IntoSeries") class DTypes: @@ -89,4 +96,6 @@ class DTypes: "Frame", "FrameT", "DataFrameT", + "IntoSeries", + "IntoSeriesT", ] From 879d3cf63265f6b758e90eca8c5aca425e5c3a41 Mon Sep 17 00:00:00 2001 From: Marco Edward Gorelli Date: Thu, 17 Oct 2024 12:15:12 +0100 Subject: [PATCH 127/145] feat: add from_arrow (which uses the PyCapsule Interface) (#1181) --- docs/api-reference/narwhals.md | 1 + narwhals/__init__.py | 2 + narwhals/functions.py | 101 +++++++++++++++++++++++++++++++++ narwhals/stable/v1/__init__.py | 49 ++++++++++++++++ tests/from_pycapsule_test.py | 45 +++++++++++++++ 5 files changed, 198 insertions(+) create mode 100644 tests/from_pycapsule_test.py diff --git a/docs/api-reference/narwhals.md b/docs/api-reference/narwhals.md index 044b20e0a..b8ec2d793 100644 --- a/docs/api-reference/narwhals.md +++ b/docs/api-reference/narwhals.md @@ -14,6 +14,7 @@ Here are the top-level functions available in Narwhals. - concat_str - from_dict - from_native + - from_arrow - get_level - get_native_namespace - is_ordered_categorical diff --git a/narwhals/__init__.py b/narwhals/__init__.py index 3a327aad4..db8bc842c 100644 --- a/narwhals/__init__.py +++ b/narwhals/__init__.py @@ -45,6 +45,7 @@ from narwhals.expr import sum_horizontal from narwhals.expr import when from narwhals.functions import concat +from narwhals.functions import from_arrow from narwhals.functions import from_dict from narwhals.functions import get_level from narwhals.functions import new_series @@ -69,6 +70,7 @@ "selectors", "concat", "from_dict", + "from_arrow", "get_level", "new_series", "to_native", diff --git a/narwhals/functions.py b/narwhals/functions.py index b84dcb174..395da97ca 100644 --- a/narwhals/functions.py +++ b/narwhals/functions.py @@ -6,6 +6,7 @@ from typing import Any from typing import Iterable from typing import Literal +from typing import Protocol from typing import TypeVar from typing import Union @@ -21,6 +22,7 @@ # The rest of the annotations seem to work fine with this anyway FrameT = TypeVar("FrameT", bound=Union[DataFrame, LazyFrame]) # type: ignore[type-arg] + if TYPE_CHECKING: from types import ModuleType @@ -29,6 +31,11 @@ from narwhals.series import Series from narwhals.typing import DTypes + class ArrowStreamExportable(Protocol): + def __arrow_c_stream__( + self, requested_schema: object | None = None + ) -> object: ... + def concat( items: Iterable[FrameT], @@ -406,6 +413,100 @@ def _from_dict_impl( return from_native(native_frame, eager_only=True) +def from_arrow( + native_frame: ArrowStreamExportable, *, native_namespace: ModuleType +) -> DataFrame[Any]: + """ + Construct a DataFrame from an object which supports the PyCapsule Interface. + + Arguments: + native_frame: Object which implements `__arrow_c_stream__`. + native_namespace: The native library to use for DataFrame creation. 
+ + Examples: + >>> import pandas as pd + >>> import polars as pl + >>> import pyarrow as pa + >>> import narwhals as nw + >>> data = {"a": [1, 2, 3], "b": [4, 5, 6]} + + Let's define a dataframe-agnostic function which creates a PyArrow + Table. + + >>> @nw.narwhalify + ... def func(df): + ... return nw.from_arrow(df, native_namespace=pa) + + Let's see what happens when passing pandas / Polars input: + + >>> func(pd.DataFrame(data)) # doctest: +SKIP + pyarrow.Table + a: int64 + b: int64 + ---- + a: [[1,2,3]] + b: [[4,5,6]] + >>> func(pl.DataFrame(data)) # doctest: +SKIP + pyarrow.Table + a: int64 + b: int64 + ---- + a: [[1,2,3]] + b: [[4,5,6]] + """ + if not hasattr(native_frame, "__arrow_c_stream__"): + msg = f"Given object of type {type(native_frame)} does not support PyCapsule interface" + raise TypeError(msg) + implementation = Implementation.from_native_namespace(native_namespace) + + if implementation is Implementation.POLARS and parse_version( + native_namespace.__version__ + ) >= (1, 3): + native_frame = native_namespace.DataFrame(native_frame) + elif implementation in { + Implementation.PANDAS, + Implementation.MODIN, + Implementation.CUDF, + Implementation.POLARS, + }: + # These don't (yet?) support the PyCapsule Interface for import + # so we go via PyArrow + try: + import pyarrow as pa # ignore-banned-import + except ModuleNotFoundError as exc: # pragma: no cover + msg = f"PyArrow>=14.0.0 is required for `from_arrow` for object of type {native_namespace}" + raise ModuleNotFoundError(msg) from exc + if parse_version(pa.__version__) < (14, 0): # pragma: no cover + msg = f"PyArrow>=14.0.0 is required for `from_arrow` for object of type {native_namespace}" + raise ModuleNotFoundError(msg) from None + + tbl = pa.table(native_frame) + if implementation is Implementation.PANDAS: + native_frame = tbl.to_pandas() + elif implementation is Implementation.MODIN: # pragma: no cover + from modin.pandas.utils import from_arrow + + native_frame = from_arrow(tbl) + elif implementation is Implementation.CUDF: # pragma: no cover + native_frame = native_namespace.DataFrame.from_arrow(tbl) + elif implementation is Implementation.POLARS: # pragma: no cover + native_frame = native_namespace.from_arrow(tbl) + else: # pragma: no cover + msg = "congratulations, you entered unrecheable code - please report a bug" + raise AssertionError(msg) + elif implementation is Implementation.PYARROW: + native_frame = native_namespace.table(native_frame) + else: # pragma: no cover + try: + # implementation is UNKNOWN, Narwhals extension using this feature should + # implement PyCapsule support + native_frame = native_namespace.DataFrame(native_frame) + except AttributeError as e: + msg = "Unknown namespace is expected to implement `DataFrame` class which accepts object which supports PyCapsule Interface." 
+ raise AttributeError(msg) from e + return from_native(native_frame, eager_only=True) + + def _get_sys_info() -> dict[str, str]: """System information diff --git a/narwhals/stable/v1/__init__.py b/narwhals/stable/v1/__init__.py index c500cdab1..75da2a42c 100644 --- a/narwhals/stable/v1/__init__.py +++ b/narwhals/stable/v1/__init__.py @@ -21,6 +21,7 @@ from narwhals.expr import when as nw_when from narwhals.functions import _from_dict_impl from narwhals.functions import _new_series_impl +from narwhals.functions import from_arrow as nw_from_arrow from narwhals.functions import show_versions from narwhals.schema import Schema as NwSchema from narwhals.series import Series as NwSeries @@ -66,6 +67,7 @@ from typing_extensions import Self from narwhals.dtypes import DType + from narwhals.functions import ArrowStreamExportable from narwhals.typing import IntoExpr T = TypeVar("T") @@ -2183,6 +2185,52 @@ def new_series( ) +def from_arrow( + native_frame: ArrowStreamExportable, *, native_namespace: ModuleType +) -> DataFrame[Any]: + """ + Construct a DataFrame from an object which supports the PyCapsule Interface. + + Arguments: + native_frame: Object which implements `__arrow_c_stream__`. + native_namespace: The native library to use for DataFrame creation. + + Examples: + >>> import pandas as pd + >>> import polars as pl + >>> import pyarrow as pa + >>> import narwhals.stable.v1 as nw + >>> data = {"a": [1, 2, 3], "b": [4, 5, 6]} + + Let's define a dataframe-agnostic function which creates a PyArrow + Table. + + >>> @nw.narwhalify + ... def func(df): + ... return nw.from_arrow(df, native_namespace=pa) + + Let's see what happens when passing pandas / Polars input: + + >>> func(pd.DataFrame(data)) # doctest: +SKIP + pyarrow.Table + a: int64 + b: int64 + ---- + a: [[1,2,3]] + b: [[4,5,6]] + >>> func(pl.DataFrame(data)) # doctest: +SKIP + pyarrow.Table + a: int64 + b: int64 + ---- + a: [[1,2,3]] + b: [[4,5,6]] + """ + return _stableify( # type: ignore[no-any-return] + nw_from_arrow(native_frame, native_namespace=native_namespace) + ) + + def from_dict( data: dict[str, Any], schema: dict[str, DType] | Schema | None = None, @@ -2307,5 +2355,6 @@ def from_dict( "show_versions", "Schema", "from_dict", + "from_arrow", "new_series", ] diff --git a/tests/from_pycapsule_test.py b/tests/from_pycapsule_test.py new file mode 100644 index 000000000..7ab8f1fe8 --- /dev/null +++ b/tests/from_pycapsule_test.py @@ -0,0 +1,45 @@ +import sys + +import pandas as pd +import polars as pl +import pyarrow as pa +import pytest + +import narwhals.stable.v1 as nw +from narwhals.utils import parse_version +from tests.utils import compare_dicts + + +@pytest.mark.xfail(parse_version(pa.__version__) < (14,), reason="too old") +def test_from_arrow_to_arrow() -> None: + df = nw.from_native(pl.DataFrame({"ab": [1, 2, 3], "ba": [4, 5, 6]}), eager_only=True) + result = nw.from_arrow(df, native_namespace=pa) + assert isinstance(result.to_native(), pa.Table) + expected = {"ab": [1, 2, 3], "ba": [4, 5, 6]} + compare_dicts(result, expected) + + +@pytest.mark.xfail(parse_version(pa.__version__) < (14,), reason="too old") +def test_from_arrow_to_polars(monkeypatch: pytest.MonkeyPatch) -> None: + tbl = pa.table({"ab": [1, 2, 3], "ba": [4, 5, 6]}) + monkeypatch.delitem(sys.modules, "pandas") + df = nw.from_native(tbl, eager_only=True) + result = nw.from_arrow(df, native_namespace=pl) + assert isinstance(result.to_native(), pl.DataFrame) + expected = {"ab": [1, 2, 3], "ba": [4, 5, 6]} + compare_dicts(result, expected) + assert "pandas" not in 
sys.modules + + +@pytest.mark.xfail(parse_version(pa.__version__) < (14,), reason="too old") +def test_from_arrow_to_pandas() -> None: + df = nw.from_native(pa.table({"ab": [1, 2, 3], "ba": [4, 5, 6]}), eager_only=True) + result = nw.from_arrow(df, native_namespace=pd) + assert isinstance(result.to_native(), pd.DataFrame) + expected = {"ab": [1, 2, 3], "ba": [4, 5, 6]} + compare_dicts(result, expected) + + +def test_from_arrow_invalid() -> None: + with pytest.raises(TypeError, match="PyCapsule"): + nw.from_arrow({"a": [1]}, native_namespace=pa) # type: ignore[arg-type] From e98048355bb5840690a52fba1afec16a7144f894 Mon Sep 17 00:00:00 2001 From: Marco Edward Gorelli Date: Thu, 17 Oct 2024 12:15:29 +0100 Subject: [PATCH 128/145] fix: preserve dtypes when using with_columns and length-1 pandas df (#1201) * fix: preserve dtypes when using with_columns and length-1 pandas df * pyarrow versions --- narwhals/_pandas_like/series.py | 4 +--- narwhals/_pandas_like/utils.py | 12 ++++++------ tests/frame/with_columns_test.py | 14 ++++++++++++++ 3 files changed, 21 insertions(+), 9 deletions(-) diff --git a/narwhals/_pandas_like/series.py b/narwhals/_pandas_like/series.py index 2fe53b22a..9dc9f20f6 100644 --- a/narwhals/_pandas_like/series.py +++ b/narwhals/_pandas_like/series.py @@ -619,9 +619,7 @@ def quantile( def zip_with(self: Self, mask: Any, other: Any) -> PandasLikeSeries: ser = self._native_series - mask = validate_column_comparand( - ser.index, mask, treat_length_one_as_scalar=False - ) + mask = validate_column_comparand(ser.index, mask) other = validate_column_comparand(ser.index, other) res = ser.where(mask, other) return self._from_native_series(res) diff --git a/narwhals/_pandas_like/utils.py b/narwhals/_pandas_like/utils.py index 0773764d9..5267dd07f 100644 --- a/narwhals/_pandas_like/utils.py +++ b/narwhals/_pandas_like/utils.py @@ -32,9 +32,7 @@ } -def validate_column_comparand( - index: Any, other: Any, *, treat_length_one_as_scalar: bool = True -) -> Any: +def validate_column_comparand(index: Any, other: Any) -> Any: """Validate RHS of binary operation. 
If the comparison isn't supported, return `NotImplemented` so that the @@ -55,9 +53,10 @@ def validate_column_comparand( if isinstance(other, PandasLikeDataFrame): return NotImplemented if isinstance(other, PandasLikeSeries): - if other.len() == 1 and treat_length_one_as_scalar: + if other.len() == 1: # broadcast - return other.item() + s = other._native_series + return s.__class__(s.iloc[0], index=index, dtype=s.dtype) if other._native_series.index is not index: return set_axis( other._native_series, @@ -83,7 +82,8 @@ def validate_dataframe_comparand(index: Any, other: Any) -> Any: if isinstance(other, PandasLikeSeries): if other.len() == 1: # broadcast - return other._native_series.iloc[0] + s = other._native_series + return s.__class__(s.iloc[0], index=index, dtype=s.dtype) if other._native_series.index is not index: return set_axis( other._native_series, diff --git a/tests/frame/with_columns_test.py b/tests/frame/with_columns_test.py index 44bcd39a5..8c949cc53 100644 --- a/tests/frame/with_columns_test.py +++ b/tests/frame/with_columns_test.py @@ -1,7 +1,10 @@ import numpy as np import pandas as pd +import pyarrow as pa +import pytest import narwhals.stable.v1 as nw +from narwhals.utils import parse_version from tests.utils import Constructor from tests.utils import compare_dicts @@ -40,3 +43,14 @@ def test_with_columns_order_single_row(constructor: Constructor) -> None: assert result.collect_schema().names() == ["a", "b", "z", "d"] expected = {"a": [2], "b": [4], "z": [7.0], "d": [0]} compare_dicts(result, expected) + + +def test_with_columns_dtypes_single_row( + constructor: Constructor, request: pytest.FixtureRequest +) -> None: + if "pyarrow_table" in str(constructor) and parse_version(pa.__version__) < (15,): + request.applymarker(pytest.mark.xfail) + data = {"a": ["foo"]} + df = nw.from_native(constructor(data)).with_columns(nw.col("a").cast(nw.Categorical)) + result = df.with_columns(nw.col("a")) + assert result.collect_schema() == {"a": nw.Categorical} From 5ef1803cc2fbeb5e39ddeecb3d45172a5c7244a2 Mon Sep 17 00:00:00 2001 From: Marco Edward Gorelli Date: Thu, 17 Oct 2024 12:30:22 +0100 Subject: [PATCH 129/145] release: Bump version to 1.9.4 (#1203) --- docs/installation.md | 2 +- narwhals/__init__.py | 2 +- pyproject.toml | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/docs/installation.md b/docs/installation.md index e3cd8f6db..974eaf1f4 100644 --- a/docs/installation.md +++ b/docs/installation.md @@ -13,7 +13,7 @@ Then, if you start the Python REPL and see the following: ```python >>> import narwhals >>> narwhals.__version__ -'1.9.3' +'1.9.4' ``` then installation worked correctly! 
diff --git a/narwhals/__init__.py b/narwhals/__init__.py index db8bc842c..21964da15 100644 --- a/narwhals/__init__.py +++ b/narwhals/__init__.py @@ -63,7 +63,7 @@ from narwhals.utils import maybe_reset_index from narwhals.utils import maybe_set_index -__version__ = "1.9.3" +__version__ = "1.9.4" __all__ = [ "dependencies", diff --git a/pyproject.toml b/pyproject.toml index c4974d8c0..3cbeff8f5 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -4,7 +4,7 @@ build-backend = "hatchling.build" [project] name = "narwhals" -version = "1.9.3" +version = "1.9.4" authors = [ { name="Marco Gorelli", email="33491632+MarcoGorelli@users.noreply.github.com" }, ] From 696dbff6312d6a300f8a9390b5f8316fd880527f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Dea=20Mar=C3=ADa=20L=C3=A9on?= Date: Thu, 17 Oct 2024 17:33:55 +0200 Subject: [PATCH 130/145] Simplify return function in `_pandas_like.utils.py` (#1204) * list change * combine list+struct * forgot fixed_size_list --- narwhals/_pandas_like/utils.py | 15 +-------------- 1 file changed, 1 insertion(+), 14 deletions(-) diff --git a/narwhals/_pandas_like/utils.py b/narwhals/_pandas_like/utils.py index 5267dd07f..0c5ec4711 100644 --- a/narwhals/_pandas_like/utils.py +++ b/narwhals/_pandas_like/utils.py @@ -280,20 +280,7 @@ def native_to_narwhals_dtype(native_column: Any, dtypes: DTypes) -> DType: return dtypes.Duration(du_time_unit) if dtype == "date32[day][pyarrow]": return dtypes.Date() - if dtype.startswith(("large_list", "list")): - return dtypes.List( - arrow_native_to_narwhals_dtype( - native_column.dtype.pyarrow_dtype.value_type, dtypes - ) - ) - if dtype.startswith("fixed_size_list"): - return dtypes.Array( - arrow_native_to_narwhals_dtype( - native_column.dtype.pyarrow_dtype.value_type, dtypes - ), - native_column.dtype.pyarrow_dtype.list_size, - ) - if dtype.startswith("struct"): + if dtype.startswith(("large_list", "list", "struct", "fixed_size_list")): return arrow_native_to_narwhals_dtype(native_column.dtype.pyarrow_dtype, dtypes) if dtype == "object": if ( # pragma: no cover TODO(unassigned): why does this show as uncovered? 
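A minimal sketch of what the simplified dtype mapping in the patch above means in practice, assuming pandas 2.x with pyarrow available (the column name and data below are illustrative, not taken from the patch):

    import pandas as pd
    import pyarrow as pa
    import narwhals as nw

    # A pyarrow-backed list column in pandas (dtype string "large_list<item: int64>[pyarrow]")
    # is now delegated to the arrow dtype translation, so it still resolves to a
    # nested Narwhals dtype after the simplification.
    s = pd.Series([[1, 2], [3]], dtype=pd.ArrowDtype(pa.large_list(pa.int64())))
    df = nw.from_native(pd.DataFrame({"a": s}), eager_only=True)
    print(df.schema["a"])  # expected: a List(Int64)-style nested dtype (exact repr may vary by version)
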
From 086166bae0860cee31e98eed5720d0c54a46a664 Mon Sep 17 00:00:00 2001 From: Marco Edward Gorelli Date: Thu, 17 Oct 2024 17:01:45 +0100 Subject: [PATCH 131/145] docs: add 'commotion' to Zen (#1208) --- narwhals/this.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/narwhals/this.py b/narwhals/this.py index 541ee7704..8ba7aa261 100644 --- a/narwhals/this.py +++ b/narwhals/this.py @@ -6,12 +6,12 @@ ⣿⣿⣿⣿⣿⡇⡼⡘⠛⠿⠿⠿⢿⣿⣿⣿⣿⣿⣿⣿⣿⣿⣿⣿⣿⣿⣿⣿⣿⣿ A good API is an honest one ⣿⣿⣿⡿⣫⡄⠾⣣⠹⣿⣿⣿⣶⣮⣙⠻⢿⣿⣿⣿⣿⣿⣿⣿⣿⣿⣿⣿⣿⣿ Yes, that needs documenting ⣿⣿⢋⣴⣿⣷⣬⣭⣾⣿⣿⣿⣿⣿⣿⣿⣦⡙⣿⣿⣿⣿⣿⣿⣿⣿⣿⣿⣿⣿ People learn better from examples -⣿⢃⣾⣿⣿⣿⣿⣿⣿⣿⣿⣿⣿⣿⣿⣿⣿⣷⡌⢿⣿⣿⣿⣿⣿⣿⣿⣿⣿⣿ than from explanations⠀ -⡏⠀⢰⠄⢻⣿⣿⣿⣿⡿⠋⢉⠻⣿⣿⣿⣿⣿⣿⡜⣿⣿⡿⢁⢻⣿⣿⣿⣿⣿ If in doubt, say 'no'⠀ -⡇⣌⣀⣠⣾⣿⣿⣿⣿⣇⠶⠉⢁⣿⣿⣿⣿⣿⣿⣧⡹⣿⡇⣿⣧⠻⠿⠿⠿⠿ you can always reconsider⠀ +⣿⢃⣾⣿⣿⣿⣿⣿⣿⣿⣿⣿⣿⣿⣿⣿⣿⣷⡌⢿⣿⣿⣿⣿⣿⣿⣿⣿⣿⣿ than from explanations⠀ +⡏⠀⢰⠄⢻⣿⣿⣿⣿⡿⠋⢉⠻⣿⣿⣿⣿⣿⣿⡜⣿⣿⡿⢁⢻⣿⣿⣿⣿⣿ If in doubt, better to say 'no' +⡇⣌⣀⣠⣾⣿⣿⣿⣿⣇⠶⠉⢁⣿⣿⣿⣿⣿⣿⣧⡹⣿⡇⣿⣧⠻⠿⠿⠿⠿ than to risk causing a commotion⠀ ⡧⢹⣿⣿⣿⣜⣟⣸⣿⣿⣷⣶⣿⡿⣿⣿⣝⢿⣿⣿⣷⣬⣥⣿⣿⣿⣿⣿⡟⣰ Yes, we need a test for that ⢡⣆⢻⣿⣿⣿⣿⣿⣿⣿⣿⣿⣿⣧⡙⣿⣿⡇⣿⣿⣿⣿⠟⣋⣭⣛⠻⣋⣴⣿ If you want users -⣶⣤⣤⣙⠻⣿⣿⣿⣿⣿⣿⣿⣿⣿⣷⣦⣍⣡⣿⡿⢋⣴⣿⣿⣿⣿⣿⣿⣿⣿ you need good docs⠀ +⣶⣤⣤⣙⠻⣿⣿⣿⣿⣿⣿⣿⣿⣿⣷⣦⣍⣡⣿⡿⢋⣴⣿⣿⣿⣿⣿⣿⣿⣿ you need good docs⠀ ⣿⣿⣿⣿⣿⣶⣬⣙⣛⠻⠿⠿⠿⠿⠿⠟⣛⣩⣥⣶⣿⣿⣿⣿⣿⣿⣿⣿⣿⣿ Our code is not irreplaceable""" print(ZEN) From 80aad57a3b91b211047936fd75ba1cdd581baaa9 Mon Sep 17 00:00:00 2001 From: Francesco Bruzzesi <42817048+FBruzzesi@users.noreply.github.com> Date: Thu, 17 Oct 2024 18:10:03 +0200 Subject: [PATCH 132/145] chore: tests cleanup and add `from __future__ import annotations` (#1206) --- narwhals/__init__.py | 2 + narwhals/stable/__init__.py | 2 + narwhals/stable/v1/_dtypes.py | 2 + narwhals/stable/v1/dtypes.py | 2 + noxfile.py | 8 ++- pyproject.toml | 1 + tests/conftest.py | 15 ++++-- .../dependencies/is_pandas_dataframe_test.py | 2 + tests/expr_and_series/abs_test.py | 2 + tests/expr_and_series/all_horizontal_test.py | 4 +- tests/expr_and_series/any_all_test.py | 2 + tests/expr_and_series/any_horizontal_test.py | 2 + tests/expr_and_series/arg_true_test.py | 2 + tests/expr_and_series/binary_test.py | 2 + tests/expr_and_series/clip_test.py | 2 + tests/expr_and_series/count_test.py | 2 + tests/expr_and_series/cum_sum_test.py | 2 + tests/expr_and_series/diff_test.py | 2 + tests/expr_and_series/double_selected_test.py | 2 + tests/expr_and_series/double_test.py | 2 + tests/expr_and_series/fill_null_test.py | 2 + tests/expr_and_series/filter_test.py | 2 + tests/expr_and_series/gather_every_test.py | 2 + tests/expr_and_series/is_duplicated_test.py | 2 + .../expr_and_series/is_first_distinct_test.py | 2 + tests/expr_and_series/is_in_test.py | 2 + .../expr_and_series/is_last_distinct_test.py | 2 + tests/expr_and_series/is_null_test.py | 2 + tests/expr_and_series/is_unique_test.py | 2 + tests/expr_and_series/len_test.py | 2 + tests/expr_and_series/max_horizontal_test.py | 2 + tests/expr_and_series/mean_horizontal_test.py | 2 + tests/expr_and_series/min_horizontal_test.py | 8 +-- tests/expr_and_series/mode_test.py | 2 + tests/expr_and_series/n_unique_test.py | 2 + tests/expr_and_series/null_count_test.py | 2 + tests/expr_and_series/over_test.py | 2 + tests/expr_and_series/pipe_test.py | 2 + tests/expr_and_series/sample_test.py | 2 + tests/expr_and_series/shift_test.py | 2 + tests/expr_and_series/sort_test.py | 2 + tests/expr_and_series/std_test.py | 2 + tests/expr_and_series/str/contains_test.py | 2 + tests/expr_and_series/str/head_test.py | 2 + tests/expr_and_series/str/len_chars_test.py | 2 + tests/expr_and_series/str/tail_test.py | 2 + 
tests/expr_and_series/sum_horizontal_test.py | 2 + tests/expr_and_series/tail_test.py | 2 + tests/expr_and_series/unary_test.py | 19 +++---- tests/expr_and_series/unique_test.py | 2 + tests/frame/add_test.py | 2 + tests/frame/array_dunder_test.py | 2 + tests/frame/arrow_c_stream_test.py | 2 + tests/frame/clone_test.py | 2 + tests/frame/columns_test.py | 8 ++- tests/frame/concat_test.py | 2 + tests/frame/double_test.py | 2 + tests/frame/filter_test.py | 2 + tests/frame/gather_every_test.py | 2 + tests/frame/get_column_test.py | 2 + .../interchange_native_namespace_test.py | 2 + tests/frame/interchange_schema_test.py | 2 + tests/frame/interchange_to_arrow_test.py | 2 + tests/frame/interchange_to_pandas_test.py | 2 + tests/frame/invalid_test.py | 2 + tests/frame/lazy_test.py | 8 ++- tests/frame/len_test.py | 7 ++- tests/frame/pipe_test.py | 2 + tests/frame/reindex_test.py | 2 + tests/frame/rename_test.py | 2 + tests/frame/row_test.py | 7 ++- tests/frame/rows_test.py | 51 ++++--------------- tests/frame/sample_test.py | 8 ++- tests/frame/schema_test.py | 10 +++- tests/frame/select_test.py | 2 + tests/frame/shape_test.py | 8 ++- tests/frame/to_dict_test.py | 2 + tests/frame/to_native_test.py | 8 ++- tests/frame/with_columns_sequence_test.py | 2 + tests/frame/with_columns_test.py | 2 + tests/frame/with_row_index_test.py | 2 + tests/from_dict_test.py | 2 + tests/from_pycapsule_test.py | 2 + tests/new_series_test.py | 2 + tests/no_imports_test.py | 2 + tests/series_only/alias_rename_test.py | 2 + tests/series_only/array_dunder_test.py | 2 + tests/series_only/arrow_c_stream_test.py | 2 + tests/series_only/cast_test.py | 2 + tests/series_only/is_empty_test.py | 8 ++- .../is_ordered_categorical_test.py | 8 ++- tests/series_only/shape_test.py | 8 ++- tests/series_only/slice_test.py | 2 + tests/series_only/to_dummy_test.py | 2 + tests/series_only/to_frame_test.py | 2 + tests/series_only/to_list_test.py | 2 + tests/stable_api_test.py | 2 + tests/system_info_test.py | 2 + tests/translate/from_native_test.py | 2 + tests/translate/get_native_namespace_test.py | 2 + tests/translate/to_native_test.py | 7 ++- tests/utils_test.py | 2 + tpch/__init__.py | 0 tpch/execute/__init__.py | 2 + tpch/execute/q1.py | 2 + tpch/execute/q10.py | 2 + tpch/execute/q11.py | 2 + tpch/execute/q12.py | 2 + tpch/execute/q13.py | 2 + tpch/execute/q14.py | 2 + tpch/execute/q15.py | 2 + tpch/execute/q16.py | 2 + tpch/execute/q17.py | 2 + tpch/execute/q18.py | 2 + tpch/execute/q19.py | 2 + tpch/execute/q2.py | 2 + tpch/execute/q20.py | 2 + tpch/execute/q21.py | 2 + tpch/execute/q22.py | 2 + tpch/execute/q3.py | 2 + tpch/execute/q4.py | 2 + tpch/execute/q5.py | 2 + tpch/execute/q6.py | 2 + tpch/execute/q7.py | 2 + tpch/execute/q8.py | 2 + tpch/execute/q9.py | 2 + tpch/generate_data.py | 4 +- tpch/queries/q1.py | 7 ++- tpch/queries/q10.py | 7 ++- tpch/queries/q11.py | 8 ++- tpch/queries/q12.py | 7 ++- tpch/queries/q13.py | 8 ++- tpch/queries/q14.py | 7 ++- tpch/queries/q15.py | 7 ++- tpch/queries/q16.py | 8 ++- tpch/queries/q17.py | 8 ++- tpch/queries/q18.py | 8 ++- tpch/queries/q19.py | 8 ++- tpch/queries/q2.py | 8 ++- tpch/queries/q20.py | 7 ++- tpch/queries/q21.py | 8 ++- tpch/queries/q22.py | 8 ++- tpch/queries/q3.py | 7 ++- tpch/queries/q4.py | 7 ++- tpch/queries/q5.py | 7 ++- tpch/queries/q6.py | 7 ++- tpch/queries/q7.py | 7 ++- tpch/queries/q8.py | 7 ++- tpch/queries/q9.py | 8 ++- tpch/tests/queries_test.py | 2 + utils/check_api_reference.py | 4 +- utils/check_for_no_build_errors.py | 2 + utils/generate_random_versions.py | 2 + 
153 files changed, 493 insertions(+), 98 deletions(-) create mode 100644 tpch/__init__.py diff --git a/narwhals/__init__.py b/narwhals/__init__.py index 21964da15..aeba3ef5e 100644 --- a/narwhals/__init__.py +++ b/narwhals/__init__.py @@ -1,3 +1,5 @@ +from __future__ import annotations + from narwhals import dependencies from narwhals import selectors from narwhals import stable diff --git a/narwhals/stable/__init__.py b/narwhals/stable/__init__.py index 572034fe7..60bc872a5 100644 --- a/narwhals/stable/__init__.py +++ b/narwhals/stable/__init__.py @@ -1,3 +1,5 @@ +from __future__ import annotations + from narwhals.stable import v1 __all__ = ["v1"] diff --git a/narwhals/stable/v1/_dtypes.py b/narwhals/stable/v1/_dtypes.py index 84c9adc90..459441d66 100644 --- a/narwhals/stable/v1/_dtypes.py +++ b/narwhals/stable/v1/_dtypes.py @@ -1,3 +1,5 @@ +from __future__ import annotations + from narwhals.dtypes import Array from narwhals.dtypes import Boolean from narwhals.dtypes import Categorical diff --git a/narwhals/stable/v1/dtypes.py b/narwhals/stable/v1/dtypes.py index 21bd1c5ed..37c3af0e8 100644 --- a/narwhals/stable/v1/dtypes.py +++ b/narwhals/stable/v1/dtypes.py @@ -1,3 +1,5 @@ +from __future__ import annotations + from narwhals.stable.v1._dtypes import Array from narwhals.stable.v1._dtypes import Boolean from narwhals.stable.v1._dtypes import Categorical diff --git a/noxfile.py b/noxfile.py index 1dc37b29d..1fb820c65 100644 --- a/noxfile.py +++ b/noxfile.py @@ -1,5 +1,11 @@ +from __future__ import annotations + +from typing import TYPE_CHECKING + import nox -from nox.sessions import Session + +if TYPE_CHECKING: + from nox.sessions import Session nox.options.default_venv_backend = "uv" nox.options.reuse_venv = True diff --git a/pyproject.toml b/pyproject.toml index 3cbeff8f5..d525fb677 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -94,6 +94,7 @@ convention = "google" [tool.ruff.lint.isort] force-single-line = true +required-imports = ["from __future__ import annotations"] [tool.ruff.format] docstring-code-format = true diff --git a/tests/conftest.py b/tests/conftest.py index 85c296daf..18ef366cc 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -1,4 +1,7 @@ +from __future__ import annotations + import contextlib +from typing import TYPE_CHECKING from typing import Any from typing import Callable @@ -10,10 +13,12 @@ from narwhals.dependencies import get_cudf from narwhals.dependencies import get_dask_dataframe from narwhals.dependencies import get_modin -from narwhals.typing import IntoDataFrame -from narwhals.typing import IntoFrame from narwhals.utils import parse_version -from tests.utils import Constructor + +if TYPE_CHECKING: + from narwhals.typing import IntoDataFrame + from narwhals.typing import IntoFrame + from tests.utils import Constructor with contextlib.suppress(ImportError): import modin.pandas # noqa: F401 @@ -108,7 +113,9 @@ def pyarrow_table_constructor(obj: Any) -> IntoDataFrame: @pytest.fixture(params=eager_constructors) -def constructor_eager(request: pytest.FixtureRequest) -> Callable[[Any], IntoDataFrame]: +def constructor_eager( + request: pytest.FixtureRequest, +) -> Callable[[Any], IntoDataFrame]: return request.param # type: ignore[no-any-return] diff --git a/tests/dependencies/is_pandas_dataframe_test.py b/tests/dependencies/is_pandas_dataframe_test.py index a8ffaa739..96b874952 100644 --- a/tests/dependencies/is_pandas_dataframe_test.py +++ b/tests/dependencies/is_pandas_dataframe_test.py @@ -1,3 +1,5 @@ +from __future__ import annotations + import 
pandas as pd import polars as pl diff --git a/tests/expr_and_series/abs_test.py b/tests/expr_and_series/abs_test.py index c883d7161..c324a9cfd 100644 --- a/tests/expr_and_series/abs_test.py +++ b/tests/expr_and_series/abs_test.py @@ -1,3 +1,5 @@ +from __future__ import annotations + import narwhals.stable.v1 as nw from tests.utils import Constructor from tests.utils import ConstructorEager diff --git a/tests/expr_and_series/all_horizontal_test.py b/tests/expr_and_series/all_horizontal_test.py index a5ba44600..beeaecca7 100644 --- a/tests/expr_and_series/all_horizontal_test.py +++ b/tests/expr_and_series/all_horizontal_test.py @@ -1,3 +1,5 @@ +from __future__ import annotations + from typing import Any import polars as pl @@ -66,7 +68,7 @@ def test_allh_nth(constructor: Constructor, request: pytest.FixtureRequest) -> N compare_dicts(result, expected) -def test_horizontal_expressions_emtpy(constructor: Constructor) -> None: +def test_horizontal_expressions_empty(constructor: Constructor) -> None: data = { "a": [False, False, True], "b": [False, True, True], diff --git a/tests/expr_and_series/any_all_test.py b/tests/expr_and_series/any_all_test.py index 73294c708..2406cdcff 100644 --- a/tests/expr_and_series/any_all_test.py +++ b/tests/expr_and_series/any_all_test.py @@ -1,3 +1,5 @@ +from __future__ import annotations + import narwhals.stable.v1 as nw from tests.utils import Constructor from tests.utils import ConstructorEager diff --git a/tests/expr_and_series/any_horizontal_test.py b/tests/expr_and_series/any_horizontal_test.py index cd360bf66..d98cd34d6 100644 --- a/tests/expr_and_series/any_horizontal_test.py +++ b/tests/expr_and_series/any_horizontal_test.py @@ -1,3 +1,5 @@ +from __future__ import annotations + from typing import Any import pytest diff --git a/tests/expr_and_series/arg_true_test.py b/tests/expr_and_series/arg_true_test.py index 1f71e2c42..ba6b5d68d 100644 --- a/tests/expr_and_series/arg_true_test.py +++ b/tests/expr_and_series/arg_true_test.py @@ -1,3 +1,5 @@ +from __future__ import annotations + import pytest import narwhals.stable.v1 as nw diff --git a/tests/expr_and_series/binary_test.py b/tests/expr_and_series/binary_test.py index 1ce76d9d2..6826cda37 100644 --- a/tests/expr_and_series/binary_test.py +++ b/tests/expr_and_series/binary_test.py @@ -1,3 +1,5 @@ +from __future__ import annotations + import narwhals.stable.v1 as nw from tests.utils import Constructor from tests.utils import compare_dicts diff --git a/tests/expr_and_series/clip_test.py b/tests/expr_and_series/clip_test.py index 2406f289f..14496fc49 100644 --- a/tests/expr_and_series/clip_test.py +++ b/tests/expr_and_series/clip_test.py @@ -1,3 +1,5 @@ +from __future__ import annotations + import narwhals.stable.v1 as nw from tests.utils import Constructor from tests.utils import ConstructorEager diff --git a/tests/expr_and_series/count_test.py b/tests/expr_and_series/count_test.py index ec90e1fc1..603a6daf8 100644 --- a/tests/expr_and_series/count_test.py +++ b/tests/expr_and_series/count_test.py @@ -1,3 +1,5 @@ +from __future__ import annotations + import narwhals.stable.v1 as nw from tests.utils import Constructor from tests.utils import ConstructorEager diff --git a/tests/expr_and_series/cum_sum_test.py b/tests/expr_and_series/cum_sum_test.py index a490b890e..e94bd168c 100644 --- a/tests/expr_and_series/cum_sum_test.py +++ b/tests/expr_and_series/cum_sum_test.py @@ -1,3 +1,5 @@ +from __future__ import annotations + import narwhals.stable.v1 as nw from tests.utils import Constructor from tests.utils 
import ConstructorEager diff --git a/tests/expr_and_series/diff_test.py b/tests/expr_and_series/diff_test.py index ada3147ed..c62b68d40 100644 --- a/tests/expr_and_series/diff_test.py +++ b/tests/expr_and_series/diff_test.py @@ -1,3 +1,5 @@ +from __future__ import annotations + import pyarrow as pa import pytest diff --git a/tests/expr_and_series/double_selected_test.py b/tests/expr_and_series/double_selected_test.py index 88826fb40..001e1f848 100644 --- a/tests/expr_and_series/double_selected_test.py +++ b/tests/expr_and_series/double_selected_test.py @@ -1,3 +1,5 @@ +from __future__ import annotations + import narwhals.stable.v1 as nw from tests.utils import Constructor from tests.utils import compare_dicts diff --git a/tests/expr_and_series/double_test.py b/tests/expr_and_series/double_test.py index 8f19e0202..66af086db 100644 --- a/tests/expr_and_series/double_test.py +++ b/tests/expr_and_series/double_test.py @@ -1,3 +1,5 @@ +from __future__ import annotations + import narwhals.stable.v1 as nw from tests.utils import Constructor from tests.utils import compare_dicts diff --git a/tests/expr_and_series/fill_null_test.py b/tests/expr_and_series/fill_null_test.py index 9fa7afaf9..a6315ae59 100644 --- a/tests/expr_and_series/fill_null_test.py +++ b/tests/expr_and_series/fill_null_test.py @@ -1,3 +1,5 @@ +from __future__ import annotations + import narwhals.stable.v1 as nw from tests.utils import Constructor from tests.utils import ConstructorEager diff --git a/tests/expr_and_series/filter_test.py b/tests/expr_and_series/filter_test.py index dff987ecb..afddff244 100644 --- a/tests/expr_and_series/filter_test.py +++ b/tests/expr_and_series/filter_test.py @@ -1,3 +1,5 @@ +from __future__ import annotations + import pytest import narwhals.stable.v1 as nw diff --git a/tests/expr_and_series/gather_every_test.py b/tests/expr_and_series/gather_every_test.py index e6f68be1d..2a2ce154b 100644 --- a/tests/expr_and_series/gather_every_test.py +++ b/tests/expr_and_series/gather_every_test.py @@ -1,3 +1,5 @@ +from __future__ import annotations + import pytest import narwhals.stable.v1 as nw diff --git a/tests/expr_and_series/is_duplicated_test.py b/tests/expr_and_series/is_duplicated_test.py index d0c5ae3dc..d5c934a04 100644 --- a/tests/expr_and_series/is_duplicated_test.py +++ b/tests/expr_and_series/is_duplicated_test.py @@ -1,3 +1,5 @@ +from __future__ import annotations + import narwhals.stable.v1 as nw from tests.utils import Constructor from tests.utils import ConstructorEager diff --git a/tests/expr_and_series/is_first_distinct_test.py b/tests/expr_and_series/is_first_distinct_test.py index 4f22d02f9..c4ad865e3 100644 --- a/tests/expr_and_series/is_first_distinct_test.py +++ b/tests/expr_and_series/is_first_distinct_test.py @@ -1,3 +1,5 @@ +from __future__ import annotations + import narwhals.stable.v1 as nw from tests.utils import Constructor from tests.utils import ConstructorEager diff --git a/tests/expr_and_series/is_in_test.py b/tests/expr_and_series/is_in_test.py index 29d3cf56b..6a568053a 100644 --- a/tests/expr_and_series/is_in_test.py +++ b/tests/expr_and_series/is_in_test.py @@ -1,3 +1,5 @@ +from __future__ import annotations + import pytest import narwhals.stable.v1 as nw diff --git a/tests/expr_and_series/is_last_distinct_test.py b/tests/expr_and_series/is_last_distinct_test.py index e63c161b3..efad08dcb 100644 --- a/tests/expr_and_series/is_last_distinct_test.py +++ b/tests/expr_and_series/is_last_distinct_test.py @@ -1,3 +1,5 @@ +from __future__ import annotations + import 
narwhals.stable.v1 as nw from tests.utils import Constructor from tests.utils import ConstructorEager diff --git a/tests/expr_and_series/is_null_test.py b/tests/expr_and_series/is_null_test.py index a3d5d2bae..edc0e8953 100644 --- a/tests/expr_and_series/is_null_test.py +++ b/tests/expr_and_series/is_null_test.py @@ -1,3 +1,5 @@ +from __future__ import annotations + import narwhals.stable.v1 as nw from tests.utils import Constructor from tests.utils import ConstructorEager diff --git a/tests/expr_and_series/is_unique_test.py b/tests/expr_and_series/is_unique_test.py index 8d46db92d..39d6fc071 100644 --- a/tests/expr_and_series/is_unique_test.py +++ b/tests/expr_and_series/is_unique_test.py @@ -1,3 +1,5 @@ +from __future__ import annotations + import narwhals.stable.v1 as nw from tests.utils import Constructor from tests.utils import ConstructorEager diff --git a/tests/expr_and_series/len_test.py b/tests/expr_and_series/len_test.py index 535c7dc92..8d582ce1c 100644 --- a/tests/expr_and_series/len_test.py +++ b/tests/expr_and_series/len_test.py @@ -1,3 +1,5 @@ +from __future__ import annotations + import pytest import narwhals.stable.v1 as nw diff --git a/tests/expr_and_series/max_horizontal_test.py b/tests/expr_and_series/max_horizontal_test.py index 711ce4e0d..8da95e317 100644 --- a/tests/expr_and_series/max_horizontal_test.py +++ b/tests/expr_and_series/max_horizontal_test.py @@ -1,3 +1,5 @@ +from __future__ import annotations + from typing import Any import pytest diff --git a/tests/expr_and_series/mean_horizontal_test.py b/tests/expr_and_series/mean_horizontal_test.py index ce9ac8fe0..eb78a868e 100644 --- a/tests/expr_and_series/mean_horizontal_test.py +++ b/tests/expr_and_series/mean_horizontal_test.py @@ -1,3 +1,5 @@ +from __future__ import annotations + from typing import Any import pytest diff --git a/tests/expr_and_series/min_horizontal_test.py b/tests/expr_and_series/min_horizontal_test.py index ca34d440d..eaad0528f 100644 --- a/tests/expr_and_series/min_horizontal_test.py +++ b/tests/expr_and_series/min_horizontal_test.py @@ -1,3 +1,5 @@ +from __future__ import annotations + from typing import Any import pytest @@ -7,19 +9,19 @@ from tests.utils import compare_dicts data = {"a": [1, 3, None, None], "b": [4, None, 6, None], "z": [3, 1, None, None]} -expcted_values = [1, 1, 6, float("nan")] +expected_values = [1, 1, 6, float("nan")] @pytest.mark.parametrize("col_expr", [nw.col("a"), "a"]) def test_minh(constructor: Constructor, col_expr: Any) -> None: df = nw.from_native(constructor(data)) result = df.select(horizontal_min=nw.min_horizontal(col_expr, nw.col("b"), "z")) - expected = {"horizontal_min": expcted_values} + expected = {"horizontal_min": expected_values} compare_dicts(result, expected) def test_minh_all(constructor: Constructor) -> None: df = nw.from_native(constructor(data)) result = df.select(nw.min_horizontal(nw.all()), c=nw.min_horizontal(nw.all())) - expected = {"a": expcted_values, "c": expcted_values} + expected = {"a": expected_values, "c": expected_values} compare_dicts(result, expected) diff --git a/tests/expr_and_series/mode_test.py b/tests/expr_and_series/mode_test.py index 820e05ad8..2e752ebb9 100644 --- a/tests/expr_and_series/mode_test.py +++ b/tests/expr_and_series/mode_test.py @@ -1,3 +1,5 @@ +from __future__ import annotations + import polars as pl import pytest diff --git a/tests/expr_and_series/n_unique_test.py b/tests/expr_and_series/n_unique_test.py index c4199eec1..d54e815cc 100644 --- a/tests/expr_and_series/n_unique_test.py +++ 
b/tests/expr_and_series/n_unique_test.py @@ -1,3 +1,5 @@ +from __future__ import annotations + import narwhals.stable.v1 as nw from tests.utils import Constructor from tests.utils import ConstructorEager diff --git a/tests/expr_and_series/null_count_test.py b/tests/expr_and_series/null_count_test.py index 93d467cb3..28aa66f38 100644 --- a/tests/expr_and_series/null_count_test.py +++ b/tests/expr_and_series/null_count_test.py @@ -1,3 +1,5 @@ +from __future__ import annotations + import narwhals.stable.v1 as nw from tests.utils import Constructor from tests.utils import ConstructorEager diff --git a/tests/expr_and_series/over_test.py b/tests/expr_and_series/over_test.py index 2abc9a699..4f89c29e5 100644 --- a/tests/expr_and_series/over_test.py +++ b/tests/expr_and_series/over_test.py @@ -1,3 +1,5 @@ +from __future__ import annotations + from contextlib import nullcontext as does_not_raise import pytest diff --git a/tests/expr_and_series/pipe_test.py b/tests/expr_and_series/pipe_test.py index 84b6006d7..812422f7f 100644 --- a/tests/expr_and_series/pipe_test.py +++ b/tests/expr_and_series/pipe_test.py @@ -1,3 +1,5 @@ +from __future__ import annotations + import narwhals.stable.v1 as nw from tests.utils import Constructor from tests.utils import ConstructorEager diff --git a/tests/expr_and_series/sample_test.py b/tests/expr_and_series/sample_test.py index eb6d853ec..c228ca0bd 100644 --- a/tests/expr_and_series/sample_test.py +++ b/tests/expr_and_series/sample_test.py @@ -1,3 +1,5 @@ +from __future__ import annotations + import pytest import narwhals.stable.v1 as nw diff --git a/tests/expr_and_series/shift_test.py b/tests/expr_and_series/shift_test.py index a665ff768..388b8e6ab 100644 --- a/tests/expr_and_series/shift_test.py +++ b/tests/expr_and_series/shift_test.py @@ -1,3 +1,5 @@ +from __future__ import annotations + import pyarrow as pa import narwhals.stable.v1 as nw diff --git a/tests/expr_and_series/sort_test.py b/tests/expr_and_series/sort_test.py index 2ea8cd145..3721c2599 100644 --- a/tests/expr_and_series/sort_test.py +++ b/tests/expr_and_series/sort_test.py @@ -1,3 +1,5 @@ +from __future__ import annotations + from typing import Any import pytest diff --git a/tests/expr_and_series/std_test.py b/tests/expr_and_series/std_test.py index 09779c109..9ed57c571 100644 --- a/tests/expr_and_series/std_test.py +++ b/tests/expr_and_series/std_test.py @@ -1,3 +1,5 @@ +from __future__ import annotations + import narwhals.stable.v1 as nw from tests.utils import Constructor from tests.utils import ConstructorEager diff --git a/tests/expr_and_series/str/contains_test.py b/tests/expr_and_series/str/contains_test.py index 139b71eb8..2c2e0cb9f 100644 --- a/tests/expr_and_series/str/contains_test.py +++ b/tests/expr_and_series/str/contains_test.py @@ -1,3 +1,5 @@ +from __future__ import annotations + import pandas as pd import polars as pl import pytest diff --git a/tests/expr_and_series/str/head_test.py b/tests/expr_and_series/str/head_test.py index 8da64553e..00406e9d4 100644 --- a/tests/expr_and_series/str/head_test.py +++ b/tests/expr_and_series/str/head_test.py @@ -1,3 +1,5 @@ +from __future__ import annotations + import narwhals.stable.v1 as nw from tests.utils import Constructor from tests.utils import ConstructorEager diff --git a/tests/expr_and_series/str/len_chars_test.py b/tests/expr_and_series/str/len_chars_test.py index 80a791c61..f95efd1a2 100644 --- a/tests/expr_and_series/str/len_chars_test.py +++ b/tests/expr_and_series/str/len_chars_test.py @@ -1,3 +1,5 @@ +from __future__ import 
annotations + import narwhals.stable.v1 as nw from tests.utils import Constructor from tests.utils import ConstructorEager diff --git a/tests/expr_and_series/str/tail_test.py b/tests/expr_and_series/str/tail_test.py index 260ab745c..aa0821075 100644 --- a/tests/expr_and_series/str/tail_test.py +++ b/tests/expr_and_series/str/tail_test.py @@ -1,3 +1,5 @@ +from __future__ import annotations + import narwhals.stable.v1 as nw from tests.utils import Constructor from tests.utils import ConstructorEager diff --git a/tests/expr_and_series/sum_horizontal_test.py b/tests/expr_and_series/sum_horizontal_test.py index e9e1e4a3c..91d0d3bb9 100644 --- a/tests/expr_and_series/sum_horizontal_test.py +++ b/tests/expr_and_series/sum_horizontal_test.py @@ -1,3 +1,5 @@ +from __future__ import annotations + from typing import Any import pytest diff --git a/tests/expr_and_series/tail_test.py b/tests/expr_and_series/tail_test.py index 73acb6848..8a7ae8f5b 100644 --- a/tests/expr_and_series/tail_test.py +++ b/tests/expr_and_series/tail_test.py @@ -1,3 +1,5 @@ +from __future__ import annotations + import pytest import narwhals as nw diff --git a/tests/expr_and_series/unary_test.py b/tests/expr_and_series/unary_test.py index c1e1d007b..71a00f8f3 100644 --- a/tests/expr_and_series/unary_test.py +++ b/tests/expr_and_series/unary_test.py @@ -1,3 +1,5 @@ +from __future__ import annotations + import narwhals.stable.v1 as nw from tests.utils import Constructor from tests.utils import ConstructorEager @@ -6,17 +8,12 @@ def test_unary(constructor: Constructor) -> None: data = {"a": [1, 3, 2], "b": [4, 4, 6], "z": [7.0, 8, 9]} - result = ( - nw.from_native(constructor(data)) - .with_columns( - a_mean=nw.col("a").mean(), - a_sum=nw.col("a").sum(), - b_nunique=nw.col("b").n_unique(), - z_min=nw.col("z").min(), - z_max=nw.col("z").max(), - ) - .unique(["a_mean", "a_sum", "b_nunique", "z_min", "z_max"]) - .select(["a_mean", "a_sum", "b_nunique", "z_min", "z_max"]) + result = nw.from_native(constructor(data)).select( + a_mean=nw.col("a").mean(), + a_sum=nw.col("a").sum(), + b_nunique=nw.col("b").n_unique(), + z_min=nw.col("z").min(), + z_max=nw.col("z").max(), ) expected = { "a_mean": [2], diff --git a/tests/expr_and_series/unique_test.py b/tests/expr_and_series/unique_test.py index 5048d3250..db0478e80 100644 --- a/tests/expr_and_series/unique_test.py +++ b/tests/expr_and_series/unique_test.py @@ -1,3 +1,5 @@ +from __future__ import annotations + import pytest import narwhals.stable.v1 as nw diff --git a/tests/frame/add_test.py b/tests/frame/add_test.py index c95fbae97..69133c2e8 100644 --- a/tests/frame/add_test.py +++ b/tests/frame/add_test.py @@ -1,3 +1,5 @@ +from __future__ import annotations + import narwhals.stable.v1 as nw from tests.utils import Constructor from tests.utils import compare_dicts diff --git a/tests/frame/array_dunder_test.py b/tests/frame/array_dunder_test.py index ad3085f56..90db2b621 100644 --- a/tests/frame/array_dunder_test.py +++ b/tests/frame/array_dunder_test.py @@ -1,3 +1,5 @@ +from __future__ import annotations + import numpy as np import pandas as pd import polars as pl diff --git a/tests/frame/arrow_c_stream_test.py b/tests/frame/arrow_c_stream_test.py index cb856adf9..66525f1b9 100644 --- a/tests/frame/arrow_c_stream_test.py +++ b/tests/frame/arrow_c_stream_test.py @@ -1,3 +1,5 @@ +from __future__ import annotations + import polars as pl import pyarrow as pa import pyarrow.compute as pc diff --git a/tests/frame/clone_test.py b/tests/frame/clone_test.py index e94183e2e..c115d0899 100644 --- 
a/tests/frame/clone_test.py +++ b/tests/frame/clone_test.py @@ -1,3 +1,5 @@ +from __future__ import annotations + import pytest import narwhals.stable.v1 as nw diff --git a/tests/frame/columns_test.py b/tests/frame/columns_test.py index 90a9c922d..3a18fb591 100644 --- a/tests/frame/columns_test.py +++ b/tests/frame/columns_test.py @@ -1,7 +1,13 @@ +from __future__ import annotations + +from typing import TYPE_CHECKING + import pytest import narwhals.stable.v1 as nw -from tests.utils import Constructor + +if TYPE_CHECKING: + from tests.utils import Constructor @pytest.mark.filterwarnings("ignore:Determining|Resolving.*") diff --git a/tests/frame/concat_test.py b/tests/frame/concat_test.py index 926f3f988..ebf4bcb05 100644 --- a/tests/frame/concat_test.py +++ b/tests/frame/concat_test.py @@ -1,3 +1,5 @@ +from __future__ import annotations + import pytest import narwhals.stable.v1 as nw diff --git a/tests/frame/double_test.py b/tests/frame/double_test.py index 6840145ec..1c46bf3f7 100644 --- a/tests/frame/double_test.py +++ b/tests/frame/double_test.py @@ -1,3 +1,5 @@ +from __future__ import annotations + import narwhals.stable.v1 as nw from tests.utils import Constructor from tests.utils import compare_dicts diff --git a/tests/frame/filter_test.py b/tests/frame/filter_test.py index 9c9b1b6fd..3f10fba8a 100644 --- a/tests/frame/filter_test.py +++ b/tests/frame/filter_test.py @@ -1,3 +1,5 @@ +from __future__ import annotations + from contextlib import nullcontext as does_not_raise import pytest diff --git a/tests/frame/gather_every_test.py b/tests/frame/gather_every_test.py index 40e18a30b..347132c14 100644 --- a/tests/frame/gather_every_test.py +++ b/tests/frame/gather_every_test.py @@ -1,3 +1,5 @@ +from __future__ import annotations + import pytest import narwhals.stable.v1 as nw diff --git a/tests/frame/get_column_test.py b/tests/frame/get_column_test.py index ff4ebc506..b0a2a7ca5 100644 --- a/tests/frame/get_column_test.py +++ b/tests/frame/get_column_test.py @@ -1,3 +1,5 @@ +from __future__ import annotations + import pandas as pd import pytest diff --git a/tests/frame/interchange_native_namespace_test.py b/tests/frame/interchange_native_namespace_test.py index 8a67d07b8..084f6ea05 100644 --- a/tests/frame/interchange_native_namespace_test.py +++ b/tests/frame/interchange_native_namespace_test.py @@ -1,3 +1,5 @@ +from __future__ import annotations + import duckdb import polars as pl import pytest diff --git a/tests/frame/interchange_schema_test.py b/tests/frame/interchange_schema_test.py index afec06831..33f2e0044 100644 --- a/tests/frame/interchange_schema_test.py +++ b/tests/frame/interchange_schema_test.py @@ -1,3 +1,5 @@ +from __future__ import annotations + from datetime import date from datetime import datetime from datetime import timedelta diff --git a/tests/frame/interchange_to_arrow_test.py b/tests/frame/interchange_to_arrow_test.py index 7308607ea..d1ddd2a53 100644 --- a/tests/frame/interchange_to_arrow_test.py +++ b/tests/frame/interchange_to_arrow_test.py @@ -1,3 +1,5 @@ +from __future__ import annotations + import duckdb import polars as pl import pyarrow as pa diff --git a/tests/frame/interchange_to_pandas_test.py b/tests/frame/interchange_to_pandas_test.py index f56575fa3..3cb722b1c 100644 --- a/tests/frame/interchange_to_pandas_test.py +++ b/tests/frame/interchange_to_pandas_test.py @@ -1,3 +1,5 @@ +from __future__ import annotations + import duckdb import pandas as pd import pytest diff --git a/tests/frame/invalid_test.py b/tests/frame/invalid_test.py index 
2fdf53949..834e192b7 100644 --- a/tests/frame/invalid_test.py +++ b/tests/frame/invalid_test.py @@ -1,3 +1,5 @@ +from __future__ import annotations + import numpy as np import pandas as pd import polars as pl diff --git a/tests/frame/lazy_test.py b/tests/frame/lazy_test.py index 8f1566e69..df27a4cc9 100644 --- a/tests/frame/lazy_test.py +++ b/tests/frame/lazy_test.py @@ -1,6 +1,12 @@ +from __future__ import annotations + +from typing import TYPE_CHECKING + import narwhals as nw import narwhals.stable.v1 as nw_v1 -from tests.utils import ConstructorEager + +if TYPE_CHECKING: + from tests.utils import ConstructorEager def test_lazy(constructor_eager: ConstructorEager) -> None: diff --git a/tests/frame/len_test.py b/tests/frame/len_test.py index cd082ef2e..b22f0c67d 100644 --- a/tests/frame/len_test.py +++ b/tests/frame/len_test.py @@ -1,6 +1,11 @@ +from __future__ import annotations + +from typing import TYPE_CHECKING + import narwhals.stable.v1 as nw -from tests.utils import ConstructorEager +if TYPE_CHECKING: + from tests.utils import ConstructorEager data = { "a": [1.0, 2.0, None, 4.0], "b": [None, 3.0, None, 5.0], diff --git a/tests/frame/pipe_test.py b/tests/frame/pipe_test.py index b7b57e0a1..506d4a317 100644 --- a/tests/frame/pipe_test.py +++ b/tests/frame/pipe_test.py @@ -1,3 +1,5 @@ +from __future__ import annotations + import narwhals.stable.v1 as nw from tests.utils import Constructor from tests.utils import compare_dicts diff --git a/tests/frame/reindex_test.py b/tests/frame/reindex_test.py index e21b31a8e..431e7b002 100644 --- a/tests/frame/reindex_test.py +++ b/tests/frame/reindex_test.py @@ -1,3 +1,5 @@ +from __future__ import annotations + from typing import Any import pandas as pd diff --git a/tests/frame/rename_test.py b/tests/frame/rename_test.py index 79cf3f243..d51e86f83 100644 --- a/tests/frame/rename_test.py +++ b/tests/frame/rename_test.py @@ -1,3 +1,5 @@ +from __future__ import annotations + import narwhals.stable.v1 as nw from tests.utils import Constructor from tests.utils import compare_dicts diff --git a/tests/frame/row_test.py b/tests/frame/row_test.py index d977a81f1..82af94146 100644 --- a/tests/frame/row_test.py +++ b/tests/frame/row_test.py @@ -1,9 +1,14 @@ +from __future__ import annotations + +from typing import TYPE_CHECKING from typing import Any import pytest import narwhals.stable.v1 as nw -from tests.utils import ConstructorEager + +if TYPE_CHECKING: + from tests.utils import ConstructorEager def test_row_column(request: Any, constructor_eager: ConstructorEager) -> None: diff --git a/tests/frame/rows_test.py b/tests/frame/rows_test.py index 60e18658c..7e5c1ecef 100644 --- a/tests/frame/rows_test.py +++ b/tests/frame/rows_test.py @@ -4,44 +4,15 @@ from typing import Any import pandas as pd -import polars as pl -import pyarrow as pa import pytest import narwhals.stable.v1 as nw -from narwhals.utils import parse_version if TYPE_CHECKING: from tests.utils import ConstructorEager -df_pandas = pd.DataFrame({"a": [1, 3, 2], "b": [4, 4, 6], "z": [7.0, 8, 9]}) -df_pa = pa.table({"a": [1, 3, 2], "b": [4, 4, 6], "z": [7.0, 8, 9]}) -if parse_version(pd.__version__) >= parse_version("1.5.0"): - df_pandas_pyarrow = pd.DataFrame( - {"a": [1, 3, 2], "b": [4, 4, 6], "z": [7.0, 8, 9]} - ).astype( - { - "a": "Int64[pyarrow]", - "b": "Int64[pyarrow]", - "z": "Float64[pyarrow]", - } - ) - df_pandas_nullable = pd.DataFrame( - {"a": [1, 3, 2], "b": [4, 4, 6], "z": [7.0, 8, 9]} - ).astype( - { - "a": "Int64", - "b": "Int64", - "z": "Float64", - } - ) -else: # pragma: no 
cover - df_pandas_pyarrow = df_pandas - df_pandas_nullable = df_pandas -df_polars = pl.DataFrame({"a": [1, 3, 2], "b": [4, 4, 6], "z": [7.0, 8, 9]}) - -df_pandas_na = pd.DataFrame({"a": [None, 3, 2], "b": [4, 4, 6], "z": [7.0, None, 9]}) -df_polars_na = pl.DataFrame({"a": [None, 3, 2], "b": [4, 4, 6], "z": [7.0, None, 9]}) +data = {"a": [1, 3, 2], "b": [4, 4, 6], "z": [7.0, 8, 9]} +data_na = {"a": [None, 3, 2], "b": [4, 4, 6], "z": [7.0, None, 9]} @pytest.mark.parametrize( @@ -73,8 +44,8 @@ def test_iter_rows( assert result == expected -@pytest.mark.parametrize( - "df_raw", [df_pandas, df_pandas_nullable, df_pandas_pyarrow, df_polars, df_pa] +@pytest.mark.filterwarnings( + "ignore:.*all arguments of to_dict except for the argument:FutureWarning" ) @pytest.mark.parametrize( ("named", "expected"), @@ -91,19 +62,18 @@ def test_iter_rows( ], ) def test_rows( - df_raw: Any, + constructor_eager: ConstructorEager, named: bool, # noqa: FBT001 expected: list[tuple[Any, ...]] | list[dict[str, Any]], ) -> None: - df = nw.from_native(df_raw, eager_only=True) + df = nw.from_native(constructor_eager(data), eager_only=True) result = df.rows(named=named) assert result == expected -@pytest.mark.parametrize("df_raw", [df_pandas_na, df_polars_na]) -def test_rows_with_nulls_unnamed(df_raw: Any) -> None: +def test_rows_with_nulls_unnamed(constructor_eager: ConstructorEager) -> None: # GIVEN - df = nw.from_native(df_raw, eager_only=True) + df = nw.from_native(constructor_eager(data_na), eager_only=True) # WHEN result = list(df.iter_rows(named=False)) @@ -119,10 +89,9 @@ def test_rows_with_nulls_unnamed(df_raw: Any) -> None: assert value_in_result == value -@pytest.mark.parametrize("df_raw", [df_pandas_na, df_polars_na]) -def test_rows_with_nulls_named(df_raw: Any) -> None: +def test_rows_with_nulls_named(constructor_eager: ConstructorEager) -> None: # GIVEN - df = nw.from_native(df_raw, eager_only=True) + df = nw.from_native(constructor_eager(data_na), eager_only=True) # WHEN result = list(df.iter_rows(named=True)) diff --git a/tests/frame/sample_test.py b/tests/frame/sample_test.py index 88d5969c3..ff3591fdd 100644 --- a/tests/frame/sample_test.py +++ b/tests/frame/sample_test.py @@ -1,5 +1,11 @@ +from __future__ import annotations + +from typing import TYPE_CHECKING + import narwhals.stable.v1 as nw -from tests.utils import Constructor + +if TYPE_CHECKING: + from tests.utils import Constructor def test_sample_n(constructor_eager: Constructor) -> None: diff --git a/tests/frame/schema_test.py b/tests/frame/schema_test.py index 97c3722a7..65da7bf00 100644 --- a/tests/frame/schema_test.py +++ b/tests/frame/schema_test.py @@ -1,7 +1,10 @@ +from __future__ import annotations + from datetime import date from datetime import datetime from datetime import timedelta from datetime import timezone +from typing import TYPE_CHECKING from typing import Any import duckdb @@ -11,8 +14,11 @@ import narwhals.stable.v1 as nw from narwhals.utils import parse_version -from tests.utils import Constructor -from tests.utils import ConstructorEager + +if TYPE_CHECKING: + from tests.utils import Constructor + from tests.utils import ConstructorEager + data = { "a": [datetime(2020, 1, 1)], diff --git a/tests/frame/select_test.py b/tests/frame/select_test.py index 8c01be407..df7821a5b 100644 --- a/tests/frame/select_test.py +++ b/tests/frame/select_test.py @@ -1,3 +1,5 @@ +from __future__ import annotations + import pandas as pd import pytest diff --git a/tests/frame/shape_test.py b/tests/frame/shape_test.py index 6930214f7..6cbee058d 
100644 --- a/tests/frame/shape_test.py +++ b/tests/frame/shape_test.py @@ -1,5 +1,11 @@ +from __future__ import annotations + +from typing import TYPE_CHECKING + import narwhals.stable.v1 as nw -from tests.utils import ConstructorEager + +if TYPE_CHECKING: + from tests.utils import ConstructorEager def test_shape(constructor_eager: ConstructorEager) -> None: diff --git a/tests/frame/to_dict_test.py b/tests/frame/to_dict_test.py index b76003bd1..537b68f31 100644 --- a/tests/frame/to_dict_test.py +++ b/tests/frame/to_dict_test.py @@ -1,3 +1,5 @@ +from __future__ import annotations + import pytest import narwhals.stable.v1 as nw diff --git a/tests/frame/to_native_test.py b/tests/frame/to_native_test.py index c6de99a17..fb90caf10 100644 --- a/tests/frame/to_native_test.py +++ b/tests/frame/to_native_test.py @@ -1,5 +1,11 @@ +from __future__ import annotations + +from typing import TYPE_CHECKING + import narwhals.stable.v1 as nw -from tests.utils import Constructor + +if TYPE_CHECKING: + from tests.utils import Constructor def test_to_native(constructor: Constructor) -> None: diff --git a/tests/frame/with_columns_sequence_test.py b/tests/frame/with_columns_sequence_test.py index 49db7820b..5249f0106 100644 --- a/tests/frame/with_columns_sequence_test.py +++ b/tests/frame/with_columns_sequence_test.py @@ -1,3 +1,5 @@ +from __future__ import annotations + import numpy as np import pytest diff --git a/tests/frame/with_columns_test.py b/tests/frame/with_columns_test.py index 8c949cc53..722df5c01 100644 --- a/tests/frame/with_columns_test.py +++ b/tests/frame/with_columns_test.py @@ -1,3 +1,5 @@ +from __future__ import annotations + import numpy as np import pandas as pd import pyarrow as pa diff --git a/tests/frame/with_row_index_test.py b/tests/frame/with_row_index_test.py index 8f802de0a..a4307acc3 100644 --- a/tests/frame/with_row_index_test.py +++ b/tests/frame/with_row_index_test.py @@ -1,3 +1,5 @@ +from __future__ import annotations + import narwhals.stable.v1 as nw from tests.utils import Constructor from tests.utils import compare_dicts diff --git a/tests/from_dict_test.py b/tests/from_dict_test.py index 4583b03e5..9797713d9 100644 --- a/tests/from_dict_test.py +++ b/tests/from_dict_test.py @@ -1,3 +1,5 @@ +from __future__ import annotations + import pytest import narwhals as nw diff --git a/tests/from_pycapsule_test.py b/tests/from_pycapsule_test.py index 7ab8f1fe8..496138dd2 100644 --- a/tests/from_pycapsule_test.py +++ b/tests/from_pycapsule_test.py @@ -1,3 +1,5 @@ +from __future__ import annotations + import sys import pandas as pd diff --git a/tests/new_series_test.py b/tests/new_series_test.py index 37e5d2633..f5dda284d 100644 --- a/tests/new_series_test.py +++ b/tests/new_series_test.py @@ -1,3 +1,5 @@ +from __future__ import annotations + import pandas as pd import pytest diff --git a/tests/no_imports_test.py b/tests/no_imports_test.py index b30545380..a6fe26e31 100644 --- a/tests/no_imports_test.py +++ b/tests/no_imports_test.py @@ -1,3 +1,5 @@ +from __future__ import annotations + import sys import pandas as pd diff --git a/tests/series_only/alias_rename_test.py b/tests/series_only/alias_rename_test.py index 4fa8a9993..021992735 100644 --- a/tests/series_only/alias_rename_test.py +++ b/tests/series_only/alias_rename_test.py @@ -1,3 +1,5 @@ +from __future__ import annotations + import narwhals as nw from tests.utils import Constructor from tests.utils import compare_dicts diff --git a/tests/series_only/array_dunder_test.py b/tests/series_only/array_dunder_test.py index 
0d95e2db3..3c30ef894 100644 --- a/tests/series_only/array_dunder_test.py +++ b/tests/series_only/array_dunder_test.py @@ -1,3 +1,5 @@ +from __future__ import annotations + import numpy as np import pandas as pd import pyarrow as pa diff --git a/tests/series_only/arrow_c_stream_test.py b/tests/series_only/arrow_c_stream_test.py index 9d2ebc8d0..3417bb9a5 100644 --- a/tests/series_only/arrow_c_stream_test.py +++ b/tests/series_only/arrow_c_stream_test.py @@ -1,3 +1,5 @@ +from __future__ import annotations + import polars as pl import pyarrow as pa import pyarrow.compute as pc diff --git a/tests/series_only/cast_test.py b/tests/series_only/cast_test.py index 672cbebc2..55752149b 100644 --- a/tests/series_only/cast_test.py +++ b/tests/series_only/cast_test.py @@ -1,3 +1,5 @@ +from __future__ import annotations + from datetime import date from datetime import datetime diff --git a/tests/series_only/is_empty_test.py b/tests/series_only/is_empty_test.py index 390fa7f4f..bd3aa61ed 100644 --- a/tests/series_only/is_empty_test.py +++ b/tests/series_only/is_empty_test.py @@ -1,5 +1,11 @@ +from __future__ import annotations + +from typing import TYPE_CHECKING + import narwhals.stable.v1 as nw -from tests.utils import ConstructorEager + +if TYPE_CHECKING: + from tests.utils import ConstructorEager def test_is_empty(constructor_eager: ConstructorEager) -> None: diff --git a/tests/series_only/is_ordered_categorical_test.py b/tests/series_only/is_ordered_categorical_test.py index 10251e362..58aa9616f 100644 --- a/tests/series_only/is_ordered_categorical_test.py +++ b/tests/series_only/is_ordered_categorical_test.py @@ -1,3 +1,7 @@ +from __future__ import annotations + +from typing import TYPE_CHECKING + import pandas as pd import polars as pl import pyarrow as pa @@ -5,7 +9,9 @@ import narwhals.stable.v1 as nw from narwhals.utils import parse_version -from tests.utils import ConstructorEager + +if TYPE_CHECKING: + from tests.utils import ConstructorEager def test_is_ordered_categorical() -> None: diff --git a/tests/series_only/shape_test.py b/tests/series_only/shape_test.py index d3e276bb2..1ab88eca3 100644 --- a/tests/series_only/shape_test.py +++ b/tests/series_only/shape_test.py @@ -1,5 +1,11 @@ +from __future__ import annotations + +from typing import TYPE_CHECKING + import narwhals.stable.v1 as nw -from tests.utils import ConstructorEager + +if TYPE_CHECKING: + from tests.utils import ConstructorEager def test_shape(constructor_eager: ConstructorEager) -> None: diff --git a/tests/series_only/slice_test.py b/tests/series_only/slice_test.py index eba24fdbd..0744c1b77 100644 --- a/tests/series_only/slice_test.py +++ b/tests/series_only/slice_test.py @@ -1,3 +1,5 @@ +from __future__ import annotations + import narwhals.stable.v1 as nw from tests.utils import ConstructorEager from tests.utils import compare_dicts diff --git a/tests/series_only/to_dummy_test.py b/tests/series_only/to_dummy_test.py index 938b8d04e..52b51242e 100644 --- a/tests/series_only/to_dummy_test.py +++ b/tests/series_only/to_dummy_test.py @@ -1,3 +1,5 @@ +from __future__ import annotations + import pytest import narwhals.stable.v1 as nw diff --git a/tests/series_only/to_frame_test.py b/tests/series_only/to_frame_test.py index 065da1414..77be9a4be 100644 --- a/tests/series_only/to_frame_test.py +++ b/tests/series_only/to_frame_test.py @@ -1,3 +1,5 @@ +from __future__ import annotations + import narwhals.stable.v1 as nw from tests.utils import ConstructorEager from tests.utils import compare_dicts diff --git 
a/tests/series_only/to_list_test.py b/tests/series_only/to_list_test.py index 0f91b9879..ebea07cff 100644 --- a/tests/series_only/to_list_test.py +++ b/tests/series_only/to_list_test.py @@ -1,3 +1,5 @@ +from __future__ import annotations + import pytest import narwhals.stable.v1 as nw diff --git a/tests/stable_api_test.py b/tests/stable_api_test.py index 7a67f5723..a076b0218 100644 --- a/tests/stable_api_test.py +++ b/tests/stable_api_test.py @@ -1,3 +1,5 @@ +from __future__ import annotations + from datetime import datetime from datetime import timedelta from typing import Any diff --git a/tests/system_info_test.py b/tests/system_info_test.py index 30bb0c400..75a2b190f 100644 --- a/tests/system_info_test.py +++ b/tests/system_info_test.py @@ -1,3 +1,5 @@ +from __future__ import annotations + import warnings from typing import Any diff --git a/tests/translate/from_native_test.py b/tests/translate/from_native_test.py index 8ac33b620..2d5ecd642 100644 --- a/tests/translate/from_native_test.py +++ b/tests/translate/from_native_test.py @@ -1,3 +1,5 @@ +from __future__ import annotations + from contextlib import nullcontext as does_not_raise from typing import Any diff --git a/tests/translate/get_native_namespace_test.py b/tests/translate/get_native_namespace_test.py index 60b80a1d9..f02c4c8da 100644 --- a/tests/translate/get_native_namespace_test.py +++ b/tests/translate/get_native_namespace_test.py @@ -1,3 +1,5 @@ +from __future__ import annotations + import pandas as pd import polars as pl import pyarrow as pa diff --git a/tests/translate/to_native_test.py b/tests/translate/to_native_test.py index 90ec11ab1..3d116a459 100644 --- a/tests/translate/to_native_test.py +++ b/tests/translate/to_native_test.py @@ -1,10 +1,15 @@ +from __future__ import annotations + from contextlib import nullcontext as does_not_raise +from typing import TYPE_CHECKING from typing import Any import pytest import narwhals.stable.v1 as nw -from tests.utils import ConstructorEager + +if TYPE_CHECKING: + from tests.utils import ConstructorEager @pytest.mark.parametrize( diff --git a/tests/utils_test.py b/tests/utils_test.py index cea458bc9..30805b15d 100644 --- a/tests/utils_test.py +++ b/tests/utils_test.py @@ -1,3 +1,5 @@ +from __future__ import annotations + import pandas as pd import polars as pl import pytest diff --git a/tpch/__init__.py b/tpch/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/tpch/execute/__init__.py b/tpch/execute/__init__.py index e0c448649..ecbf1db53 100644 --- a/tpch/execute/__init__.py +++ b/tpch/execute/__init__.py @@ -1,3 +1,5 @@ +from __future__ import annotations + from pathlib import Path import dask.dataframe as dd diff --git a/tpch/execute/q1.py b/tpch/execute/q1.py index 9889c3af0..d0ebce584 100644 --- a/tpch/execute/q1.py +++ b/tpch/execute/q1.py @@ -1,3 +1,5 @@ +from __future__ import annotations + from queries import q1 from . import IO_FUNCS diff --git a/tpch/execute/q10.py b/tpch/execute/q10.py index 124bf0f7d..1f610932c 100644 --- a/tpch/execute/q10.py +++ b/tpch/execute/q10.py @@ -1,3 +1,5 @@ +from __future__ import annotations + from queries import q10 from . import IO_FUNCS diff --git a/tpch/execute/q11.py b/tpch/execute/q11.py index 8c0a2e649..0dd8a243c 100644 --- a/tpch/execute/q11.py +++ b/tpch/execute/q11.py @@ -1,3 +1,5 @@ +from __future__ import annotations + from queries import q11 from . 
import IO_FUNCS diff --git a/tpch/execute/q12.py b/tpch/execute/q12.py index 3c3a70c62..f684e22ad 100644 --- a/tpch/execute/q12.py +++ b/tpch/execute/q12.py @@ -1,3 +1,5 @@ +from __future__ import annotations + from queries import q12 from . import IO_FUNCS diff --git a/tpch/execute/q13.py b/tpch/execute/q13.py index 2fdda5bd3..7b03a2f2f 100644 --- a/tpch/execute/q13.py +++ b/tpch/execute/q13.py @@ -1,3 +1,5 @@ +from __future__ import annotations + from queries import q13 from . import IO_FUNCS diff --git a/tpch/execute/q14.py b/tpch/execute/q14.py index dfd54056e..a82330136 100644 --- a/tpch/execute/q14.py +++ b/tpch/execute/q14.py @@ -1,3 +1,5 @@ +from __future__ import annotations + from queries import q14 from . import IO_FUNCS diff --git a/tpch/execute/q15.py b/tpch/execute/q15.py index 86a03b0a0..40b4432b1 100644 --- a/tpch/execute/q15.py +++ b/tpch/execute/q15.py @@ -1,3 +1,5 @@ +from __future__ import annotations + from queries import q15 from . import IO_FUNCS diff --git a/tpch/execute/q16.py b/tpch/execute/q16.py index 6a70279d0..ef30f935c 100644 --- a/tpch/execute/q16.py +++ b/tpch/execute/q16.py @@ -1,3 +1,5 @@ +from __future__ import annotations + from queries import q16 from . import IO_FUNCS diff --git a/tpch/execute/q17.py b/tpch/execute/q17.py index 43ef4f8b1..0b7ca4a66 100644 --- a/tpch/execute/q17.py +++ b/tpch/execute/q17.py @@ -1,3 +1,5 @@ +from __future__ import annotations + from queries import q17 from . import IO_FUNCS diff --git a/tpch/execute/q18.py b/tpch/execute/q18.py index c7e5b7954..a096deb2f 100644 --- a/tpch/execute/q18.py +++ b/tpch/execute/q18.py @@ -1,3 +1,5 @@ +from __future__ import annotations + from queries import q18 from . import IO_FUNCS diff --git a/tpch/execute/q19.py b/tpch/execute/q19.py index 60f91b052..23095a890 100644 --- a/tpch/execute/q19.py +++ b/tpch/execute/q19.py @@ -1,3 +1,5 @@ +from __future__ import annotations + from queries import q19 from . import IO_FUNCS diff --git a/tpch/execute/q2.py b/tpch/execute/q2.py index cd82a9047..0e2d07019 100644 --- a/tpch/execute/q2.py +++ b/tpch/execute/q2.py @@ -1,3 +1,5 @@ +from __future__ import annotations + from queries import q2 from . import IO_FUNCS diff --git a/tpch/execute/q20.py b/tpch/execute/q20.py index 3984b7580..c4ffa43b4 100644 --- a/tpch/execute/q20.py +++ b/tpch/execute/q20.py @@ -1,3 +1,5 @@ +from __future__ import annotations + from queries import q20 from . import IO_FUNCS diff --git a/tpch/execute/q21.py b/tpch/execute/q21.py index 7cf772d8e..d6fb272ad 100644 --- a/tpch/execute/q21.py +++ b/tpch/execute/q21.py @@ -1,3 +1,5 @@ +from __future__ import annotations + from queries import q21 from . import IO_FUNCS diff --git a/tpch/execute/q22.py b/tpch/execute/q22.py index a2bb1e76d..f71fc4220 100644 --- a/tpch/execute/q22.py +++ b/tpch/execute/q22.py @@ -1,3 +1,5 @@ +from __future__ import annotations + from queries import q22 from . import IO_FUNCS diff --git a/tpch/execute/q3.py b/tpch/execute/q3.py index d6b9302cc..bbcc51d5c 100644 --- a/tpch/execute/q3.py +++ b/tpch/execute/q3.py @@ -1,3 +1,5 @@ +from __future__ import annotations + from queries import q3 from . import IO_FUNCS diff --git a/tpch/execute/q4.py b/tpch/execute/q4.py index 5645574f8..bcfd3a158 100644 --- a/tpch/execute/q4.py +++ b/tpch/execute/q4.py @@ -1,3 +1,5 @@ +from __future__ import annotations + from queries import q4 from . 
import IO_FUNCS diff --git a/tpch/execute/q5.py b/tpch/execute/q5.py index dcc61027b..66524c5a8 100644 --- a/tpch/execute/q5.py +++ b/tpch/execute/q5.py @@ -1,3 +1,5 @@ +from __future__ import annotations + from queries import q5 from . import IO_FUNCS diff --git a/tpch/execute/q6.py b/tpch/execute/q6.py index 154964ff4..1d650b794 100644 --- a/tpch/execute/q6.py +++ b/tpch/execute/q6.py @@ -1,3 +1,5 @@ +from __future__ import annotations + from queries import q6 from . import IO_FUNCS diff --git a/tpch/execute/q7.py b/tpch/execute/q7.py index a08d5641c..069fb258b 100644 --- a/tpch/execute/q7.py +++ b/tpch/execute/q7.py @@ -1,3 +1,5 @@ +from __future__ import annotations + from queries import q7 from . import IO_FUNCS diff --git a/tpch/execute/q8.py b/tpch/execute/q8.py index a76a8051f..8c3aa5de9 100644 --- a/tpch/execute/q8.py +++ b/tpch/execute/q8.py @@ -1,3 +1,5 @@ +from __future__ import annotations + from queries import q8 from . import IO_FUNCS diff --git a/tpch/execute/q9.py b/tpch/execute/q9.py index 14230af64..4c8e6874c 100644 --- a/tpch/execute/q9.py +++ b/tpch/execute/q9.py @@ -1,3 +1,5 @@ +from __future__ import annotations + from queries import q9 from . import IO_FUNCS diff --git a/tpch/generate_data.py b/tpch/generate_data.py index 5fd73b1f7..d0a370a2a 100644 --- a/tpch/generate_data.py +++ b/tpch/generate_data.py @@ -1,4 +1,6 @@ -from pathlib import Path # noqa: INP001 +from __future__ import annotations + +from pathlib import Path import duckdb import pyarrow as pa diff --git a/tpch/queries/q1.py b/tpch/queries/q1.py index de6157702..a9c887b0a 100644 --- a/tpch/queries/q1.py +++ b/tpch/queries/q1.py @@ -1,7 +1,12 @@ +from __future__ import annotations + from datetime import datetime +from typing import TYPE_CHECKING import narwhals as nw -from narwhals.typing import FrameT + +if TYPE_CHECKING: + from narwhals.typing import FrameT @nw.narwhalify diff --git a/tpch/queries/q10.py b/tpch/queries/q10.py index 486e4ba82..b83d1e1b0 100644 --- a/tpch/queries/q10.py +++ b/tpch/queries/q10.py @@ -1,7 +1,12 @@ +from __future__ import annotations + from datetime import datetime +from typing import TYPE_CHECKING import narwhals as nw -from narwhals.typing import FrameT + +if TYPE_CHECKING: + from narwhals.typing import FrameT @nw.narwhalify diff --git a/tpch/queries/q11.py b/tpch/queries/q11.py index d5b48b359..66bacd593 100644 --- a/tpch/queries/q11.py +++ b/tpch/queries/q11.py @@ -1,5 +1,11 @@ +from __future__ import annotations + +from typing import TYPE_CHECKING + import narwhals as nw -from narwhals.typing import FrameT + +if TYPE_CHECKING: + from narwhals.typing import FrameT @nw.narwhalify diff --git a/tpch/queries/q12.py b/tpch/queries/q12.py index ced775830..fb2a3dabe 100644 --- a/tpch/queries/q12.py +++ b/tpch/queries/q12.py @@ -1,7 +1,12 @@ +from __future__ import annotations + from datetime import datetime +from typing import TYPE_CHECKING import narwhals as nw -from narwhals.typing import FrameT + +if TYPE_CHECKING: + from narwhals.typing import FrameT @nw.narwhalify diff --git a/tpch/queries/q13.py b/tpch/queries/q13.py index adf57e5a2..e7499b158 100644 --- a/tpch/queries/q13.py +++ b/tpch/queries/q13.py @@ -1,5 +1,11 @@ +from __future__ import annotations + +from typing import TYPE_CHECKING + import narwhals as nw -from narwhals.typing import FrameT + +if TYPE_CHECKING: + from narwhals.typing import FrameT @nw.narwhalify diff --git a/tpch/queries/q14.py b/tpch/queries/q14.py index f1ec6cbe3..44d176772 100644 --- a/tpch/queries/q14.py +++ b/tpch/queries/q14.py @@ -1,7 
+1,12 @@ +from __future__ import annotations + from datetime import datetime +from typing import TYPE_CHECKING import narwhals as nw -from narwhals.typing import FrameT + +if TYPE_CHECKING: + from narwhals.typing import FrameT @nw.narwhalify diff --git a/tpch/queries/q15.py b/tpch/queries/q15.py index 1ebae57d6..f6a23dd30 100644 --- a/tpch/queries/q15.py +++ b/tpch/queries/q15.py @@ -1,7 +1,12 @@ +from __future__ import annotations + from datetime import datetime +from typing import TYPE_CHECKING import narwhals as nw -from narwhals.typing import FrameT + +if TYPE_CHECKING: + from narwhals.typing import FrameT @nw.narwhalify diff --git a/tpch/queries/q16.py b/tpch/queries/q16.py index d84b9aab5..f3609ae3d 100644 --- a/tpch/queries/q16.py +++ b/tpch/queries/q16.py @@ -1,5 +1,11 @@ +from __future__ import annotations + +from typing import TYPE_CHECKING + import narwhals as nw -from narwhals.typing import FrameT + +if TYPE_CHECKING: + from narwhals.typing import FrameT @nw.narwhalify diff --git a/tpch/queries/q17.py b/tpch/queries/q17.py index 976f476f0..cf507efad 100644 --- a/tpch/queries/q17.py +++ b/tpch/queries/q17.py @@ -1,5 +1,11 @@ +from __future__ import annotations + +from typing import TYPE_CHECKING + import narwhals as nw -from narwhals.typing import FrameT + +if TYPE_CHECKING: + from narwhals.typing import FrameT @nw.narwhalify diff --git a/tpch/queries/q18.py b/tpch/queries/q18.py index d3d183176..cdeeeca0a 100644 --- a/tpch/queries/q18.py +++ b/tpch/queries/q18.py @@ -1,5 +1,11 @@ +from __future__ import annotations + +from typing import TYPE_CHECKING + import narwhals as nw -from narwhals.typing import FrameT + +if TYPE_CHECKING: + from narwhals.typing import FrameT @nw.narwhalify diff --git a/tpch/queries/q19.py b/tpch/queries/q19.py index bcab36e9a..63cb11dd3 100644 --- a/tpch/queries/q19.py +++ b/tpch/queries/q19.py @@ -1,5 +1,11 @@ +from __future__ import annotations + +from typing import TYPE_CHECKING + import narwhals as nw -from narwhals.typing import FrameT + +if TYPE_CHECKING: + from narwhals.typing import FrameT @nw.narwhalify diff --git a/tpch/queries/q2.py b/tpch/queries/q2.py index 0e9e90d09..82c76bd34 100644 --- a/tpch/queries/q2.py +++ b/tpch/queries/q2.py @@ -1,5 +1,11 @@ +from __future__ import annotations + +from typing import TYPE_CHECKING + import narwhals as nw -from narwhals.typing import FrameT + +if TYPE_CHECKING: + from narwhals.typing import FrameT @nw.narwhalify diff --git a/tpch/queries/q20.py b/tpch/queries/q20.py index b0dabb29e..0cb82e394 100644 --- a/tpch/queries/q20.py +++ b/tpch/queries/q20.py @@ -1,7 +1,12 @@ +from __future__ import annotations + from datetime import datetime +from typing import TYPE_CHECKING import narwhals as nw -from narwhals.typing import FrameT + +if TYPE_CHECKING: + from narwhals.typing import FrameT @nw.narwhalify diff --git a/tpch/queries/q21.py b/tpch/queries/q21.py index d10ff394f..5d6cc8784 100644 --- a/tpch/queries/q21.py +++ b/tpch/queries/q21.py @@ -1,5 +1,11 @@ +from __future__ import annotations + +from typing import TYPE_CHECKING + import narwhals as nw -from narwhals.typing import FrameT + +if TYPE_CHECKING: + from narwhals.typing import FrameT @nw.narwhalify diff --git a/tpch/queries/q22.py b/tpch/queries/q22.py index 2e0973227..7bd76761f 100644 --- a/tpch/queries/q22.py +++ b/tpch/queries/q22.py @@ -1,5 +1,11 @@ +from __future__ import annotations + +from typing import TYPE_CHECKING + import narwhals as nw -from narwhals.typing import FrameT + +if TYPE_CHECKING: + from narwhals.typing import FrameT 
@nw.narwhalify diff --git a/tpch/queries/q3.py b/tpch/queries/q3.py index 04679bccb..517d40154 100644 --- a/tpch/queries/q3.py +++ b/tpch/queries/q3.py @@ -1,7 +1,12 @@ +from __future__ import annotations + from datetime import datetime +from typing import TYPE_CHECKING import narwhals as nw -from narwhals.typing import FrameT + +if TYPE_CHECKING: + from narwhals.typing import FrameT @nw.narwhalify diff --git a/tpch/queries/q4.py b/tpch/queries/q4.py index a1b96be15..12a5cecd8 100644 --- a/tpch/queries/q4.py +++ b/tpch/queries/q4.py @@ -1,7 +1,12 @@ +from __future__ import annotations + from datetime import datetime +from typing import TYPE_CHECKING import narwhals as nw -from narwhals.typing import FrameT + +if TYPE_CHECKING: + from narwhals.typing import FrameT @nw.narwhalify diff --git a/tpch/queries/q5.py b/tpch/queries/q5.py index 2965868c9..39b402077 100644 --- a/tpch/queries/q5.py +++ b/tpch/queries/q5.py @@ -1,7 +1,12 @@ +from __future__ import annotations + from datetime import datetime +from typing import TYPE_CHECKING import narwhals as nw -from narwhals.typing import FrameT + +if TYPE_CHECKING: + from narwhals.typing import FrameT @nw.narwhalify diff --git a/tpch/queries/q6.py b/tpch/queries/q6.py index 67f0ac785..66b286b4b 100644 --- a/tpch/queries/q6.py +++ b/tpch/queries/q6.py @@ -1,7 +1,12 @@ +from __future__ import annotations + from datetime import datetime +from typing import TYPE_CHECKING import narwhals as nw -from narwhals.typing import FrameT + +if TYPE_CHECKING: + from narwhals.typing import FrameT @nw.narwhalify diff --git a/tpch/queries/q7.py b/tpch/queries/q7.py index ec0946ac3..576a1804c 100644 --- a/tpch/queries/q7.py +++ b/tpch/queries/q7.py @@ -1,7 +1,12 @@ +from __future__ import annotations + from datetime import datetime +from typing import TYPE_CHECKING import narwhals as nw -from narwhals.typing import FrameT + +if TYPE_CHECKING: + from narwhals.typing import FrameT @nw.narwhalify diff --git a/tpch/queries/q8.py b/tpch/queries/q8.py index ac3fa4baf..1ece5604b 100644 --- a/tpch/queries/q8.py +++ b/tpch/queries/q8.py @@ -1,7 +1,12 @@ +from __future__ import annotations + from datetime import date +from typing import TYPE_CHECKING import narwhals as nw -from narwhals.typing import FrameT + +if TYPE_CHECKING: + from narwhals.typing import FrameT @nw.narwhalify diff --git a/tpch/queries/q9.py b/tpch/queries/q9.py index 09dff4787..048538bc3 100644 --- a/tpch/queries/q9.py +++ b/tpch/queries/q9.py @@ -1,5 +1,11 @@ +from __future__ import annotations + +from typing import TYPE_CHECKING + import narwhals as nw -from narwhals.typing import FrameT + +if TYPE_CHECKING: + from narwhals.typing import FrameT @nw.narwhalify diff --git a/tpch/tests/queries_test.py b/tpch/tests/queries_test.py index 35909b683..c228fd52b 100644 --- a/tpch/tests/queries_test.py +++ b/tpch/tests/queries_test.py @@ -1,3 +1,5 @@ +from __future__ import annotations + import subprocess import sys from pathlib import Path diff --git a/utils/check_api_reference.py b/utils/check_api_reference.py index e3aa0fb91..b7d8595aa 100644 --- a/utils/check_api_reference.py +++ b/utils/check_api_reference.py @@ -1,3 +1,5 @@ +from __future__ import annotations + import os import sys @@ -53,7 +55,7 @@ for i in content.splitlines() if i.startswith(" - ") ] -if missing := set(top_level_functions).difference(documented): +if missing := set(top_level_functions).difference(documented).difference({"annotations"}): print("top-level functions: not documented") # noqa: T201 print(missing) # noqa: T201 ret = 1 diff --git 
a/utils/check_for_no_build_errors.py b/utils/check_for_no_build_errors.py index 995411e9d..48b5a9314 100644 --- a/utils/check_for_no_build_errors.py +++ b/utils/check_for_no_build_errors.py @@ -5,6 +5,8 @@ This is just used in CI. """ +from __future__ import annotations + import sys with open("output.txt") as fd: diff --git a/utils/generate_random_versions.py b/utils/generate_random_versions.py index ecb709c1a..7ad8e044d 100644 --- a/utils/generate_random_versions.py +++ b/utils/generate_random_versions.py @@ -1,3 +1,5 @@ +from __future__ import annotations + import random PANDAS_AND_NUMPY_VERSION = [ From 55e00bb3d0ca4e1b3e4f092344b988639230c09c Mon Sep 17 00:00:00 2001 From: Barret Schloerke Date: Thu, 17 Oct 2024 13:02:24 -0400 Subject: [PATCH 133/145] fix: Add df`.rows(named=False)` support for cuDF data frames (#1186) --------- Co-authored-by: Francesco Bruzzesi <42817048+FBruzzesi@users.noreply.github.com> Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com> --- narwhals/_arrow/dataframe.py | 40 +++++++++++++++++++++------ narwhals/_pandas_like/dataframe.py | 44 ++++++++++++++++++++++++------ narwhals/dataframe.py | 22 +++++++-------- tests/frame/rows_test.py | 29 ++++++++++++++++++++ 4 files changed, 108 insertions(+), 27 deletions(-) diff --git a/narwhals/_arrow/dataframe.py b/narwhals/_arrow/dataframe.py index a0a4e16cb..6b87f1d8d 100644 --- a/narwhals/_arrow/dataframe.py +++ b/narwhals/_arrow/dataframe.py @@ -83,6 +83,26 @@ def __len__(self) -> int: def row(self, index: int) -> tuple[Any, ...]: return tuple(col[index] for col in self._native_frame) + @overload + def rows( + self, + *, + named: Literal[True], + ) -> list[dict[str, Any]]: ... + + @overload + def rows( + self, + *, + named: Literal[False] = False, + ) -> list[tuple[Any, ...]]: ... + @overload + def rows( + self, + *, + named: bool, + ) -> list[tuple[Any, ...]] | list[dict[str, Any]]: ... + def rows( self, *, named: bool = False ) -> list[tuple[Any, ...]] | list[dict[str, Any]]: @@ -141,13 +161,15 @@ def __getitem__(self, item: tuple[slice, slice]) -> ArrowDataFrame: ... 
def __getitem__( self, - item: str - | slice - | Sequence[int] - | Sequence[str] - | tuple[Sequence[int], str | int] - | tuple[slice, str | int] - | tuple[slice, slice], + item: ( + str + | slice + | Sequence[int] + | Sequence[str] + | tuple[Sequence[int], str | int] + | tuple[slice, str | int] + | tuple[slice, slice] + ), ) -> ArrowSeries | ArrowDataFrame: if isinstance(item, tuple): item = tuple(list(i) if is_sequence_but_not_str(i) else i for i in item) @@ -496,7 +518,9 @@ def lazy(self) -> Self: def collect(self) -> ArrowDataFrame: return ArrowDataFrame( - self._native_frame, backend_version=self._backend_version, dtypes=self._dtypes + self._native_frame, + backend_version=self._backend_version, + dtypes=self._dtypes, ) def clone(self) -> Self: diff --git a/narwhals/_pandas_like/dataframe.py b/narwhals/_pandas_like/dataframe.py index 6a6292988..8380b48db 100644 --- a/narwhals/_pandas_like/dataframe.py +++ b/narwhals/_pandas_like/dataframe.py @@ -141,14 +141,16 @@ def __getitem__(self, item: tuple[slice, Sequence[int]]) -> PandasLikeDataFrame: def __getitem__( self, - item: str - | int - | slice - | Sequence[int] - | Sequence[str] - | tuple[Sequence[int], str | int] - | tuple[slice | Sequence[int], Sequence[int] | slice] - | tuple[slice, slice], + item: ( + str + | int + | slice + | Sequence[int] + | Sequence[str] + | tuple[Sequence[int], str | int] + | tuple[slice | Sequence[int], Sequence[int] | slice] + | tuple[slice, slice] + ), ) -> PandasLikeSeries | PandasLikeDataFrame: if isinstance(item, tuple): item = tuple(list(i) if is_sequence_but_not_str(i) else i for i in item) @@ -245,10 +247,36 @@ def __getitem__( def columns(self) -> list[str]: return self._native_frame.columns.tolist() # type: ignore[no-any-return] + @overload + def rows( + self, + *, + named: Literal[True], + ) -> list[dict[str, Any]]: ... + + @overload + def rows( + self, + *, + named: Literal[False] = False, + ) -> list[tuple[Any, ...]]: ... + + @overload + def rows( + self, + *, + named: bool, + ) -> list[tuple[Any, ...]] | list[dict[str, Any]]: ... + def rows( self, *, named: bool = False ) -> list[tuple[Any, ...]] | list[dict[str, Any]]: if not named: + # cuDF does not support itertuples. But it does support to_dict! + if self._implementation is Implementation.CUDF: # pragma: no cover + # Extract the row values from the named rows + return [tuple(row.values()) for row in self.rows(named=True)] + return list(self._native_frame.itertuples(index=False, name=None)) return self._native_frame.to_dict(orient="records") # type: ignore[no-any-return] diff --git a/narwhals/dataframe.py b/narwhals/dataframe.py index 5c8c7e13e..ac814afc7 100644 --- a/narwhals/dataframe.py +++ b/narwhals/dataframe.py @@ -742,14 +742,16 @@ def __getitem__(self, item: tuple[slice, slice]) -> Self: ... def __getitem__( self, - item: str - | slice - | Sequence[int] - | Sequence[str] - | tuple[Sequence[int], str | int] - | tuple[slice, str | int] - | tuple[slice | Sequence[int], Sequence[int] | Sequence[str] | slice] - | tuple[slice, slice], + item: ( + str + | slice + | Sequence[int] + | Sequence[str] + | tuple[Sequence[int], str | int] + | tuple[slice, str | int] + | tuple[slice | Sequence[int], Sequence[int] | Sequence[str] | slice] + | tuple[slice, slice] + ), ) -> Series | Self: """ Extract column or slice of DataFrame. @@ -1195,16 +1197,14 @@ def columns(self) -> list[str]: def rows( self, *, - named: Literal[False], + named: Literal[False] = False, ) -> list[tuple[Any, ...]]: ... 
- @overload def rows( self, *, named: Literal[True], ) -> list[dict[str, Any]]: ... - @overload def rows( self, diff --git a/tests/frame/rows_test.py b/tests/frame/rows_test.py index 7e5c1ecef..106da430a 100644 --- a/tests/frame/rows_test.py +++ b/tests/frame/rows_test.py @@ -71,6 +71,35 @@ def test_rows( assert result == expected +@pytest.mark.filterwarnings( + r"ignore:.*Starting with pandas version 3\.0 all arguments of to_dict" +) +@pytest.mark.parametrize( + ("named", "expected"), + [ + (False, [(1, 4, 7.0, 5), (3, 4, 8.0, 6), (2, 6, 9.0, 7)]), + ( + True, + [ + {"a": 1, "_b": 4, "z": 7.0, "1": 5}, + {"a": 3, "_b": 4, "z": 8.0, "1": 6}, + {"a": 2, "_b": 6, "z": 9.0, "1": 7}, + ], + ), + ], +) +def test_rows_eager( + constructor_eager: Any, + named: bool, # noqa: FBT001 + expected: list[tuple[Any, ...]] | list[dict[str, Any]], +) -> None: + # posit-dev/py-shiny relies on `.rows(named=False)` to return unnamed rows + data = {"a": [1, 3, 2], "_b": [4, 4, 6], "z": [7.0, 8, 9], "1": [5, 6, 7]} + df = nw.from_native(constructor_eager(data), eager_only=True) + result = df.rows(named=named) + assert result == expected + + def test_rows_with_nulls_unnamed(constructor_eager: ConstructorEager) -> None: # GIVEN df = nw.from_native(constructor_eager(data_na), eager_only=True) From e4fe3eef4ce6fe4006004e40e8c0ac88000e88f0 Mon Sep 17 00:00:00 2001 From: Marco Edward Gorelli Date: Thu, 17 Oct 2024 18:22:10 +0100 Subject: [PATCH 134/145] test: xfail iter_rows tests for cudf (#1209) --- tests/frame/rows_test.py | 24 ++++++++++++------------ 1 file changed, 12 insertions(+), 12 deletions(-) diff --git a/tests/frame/rows_test.py b/tests/frame/rows_test.py index 106da430a..cdf426483 100644 --- a/tests/frame/rows_test.py +++ b/tests/frame/rows_test.py @@ -100,14 +100,14 @@ def test_rows_eager( assert result == expected -def test_rows_with_nulls_unnamed(constructor_eager: ConstructorEager) -> None: - # GIVEN +def test_rows_with_nulls_unnamed( + constructor_eager: ConstructorEager, request: pytest.FixtureRequest +) -> None: + if "cudf" in str(constructor_eager): + # cudf intentionally doesn't support itertuples / iter_rows + request.applymarker(pytest.mark.xfail) df = nw.from_native(constructor_eager(data_na), eager_only=True) - - # WHEN result = list(df.iter_rows(named=False)) - - # THEN expected = [(None, 4, 7.0), (3, 4, None), (2, 6, 9.0)] for i, row in enumerate(expected): for j, value in enumerate(row): @@ -118,14 +118,14 @@ def test_rows_with_nulls_unnamed(constructor_eager: ConstructorEager) -> None: assert value_in_result == value -def test_rows_with_nulls_named(constructor_eager: ConstructorEager) -> None: - # GIVEN +def test_rows_with_nulls_named( + constructor_eager: ConstructorEager, request: pytest.FixtureRequest +) -> None: + if "cudf" in str(constructor_eager): + # cudf intentionally doesn't support itertuples / iter_rows + request.applymarker(pytest.mark.xfail) df = nw.from_native(constructor_eager(data_na), eager_only=True) - - # WHEN result = list(df.iter_rows(named=True)) - - # THEN expected: list[dict[str, Any]] = [ {"a": None, "b": 4, "z": 7.0}, {"a": 3, "b": 4, "z": None}, From 1ce93de3f73512262ae4670f5f199296e1ebb0b5 Mon Sep 17 00:00:00 2001 From: Carlo Lepelaars Date: Thu, 17 Oct 2024 22:04:56 +0200 Subject: [PATCH 135/145] docs: Add Pyarrow example to `Expr.mean` and `nw.mean` (#1207) * Add Pyarrow example to Expr.mean * Make mean docstring consistent with stable v1 API --- narwhals/expr.py | 20 ++++++++++++++++++-- narwhals/stable/v1/__init__.py | 9 ++++++++- 2 files changed, 26 
insertions(+), 3 deletions(-) diff --git a/narwhals/expr.py b/narwhals/expr.py index 6eedbafa4..97c2e2d36 100644 --- a/narwhals/expr.py +++ b/narwhals/expr.py @@ -367,9 +367,11 @@ def mean(self) -> Self: Examples: >>> import polars as pl >>> import pandas as pd + >>> import pyarrow as pa >>> import narwhals as nw >>> df_pd = pd.DataFrame({"a": [-1, 0, 1], "b": [2, 4, 6]}) >>> df_pl = pl.DataFrame({"a": [-1, 0, 1], "b": [2, 4, 6]}) + >>> df_pa = pa.table({"a": [-1, 0, 1], "b": [2, 4, 6]}) Let's define a dataframe-agnostic function: @@ -377,7 +379,7 @@ def mean(self) -> Self: ... def func(df): ... return df.select(nw.col("a", "b").mean()) - We can then pass either pandas or Polars to `func`: + We can pass any supported library such as Pandas, Polars, or PyArrow to `func`: >>> func(df_pd) a b @@ -391,6 +393,13 @@ def mean(self) -> Self: ╞═════╪═════╡ │ 0.0 ┆ 4.0 │ └─────┴─────┘ + >>> func(df_pa) + pyarrow.Table + a: double + b: double + ---- + a: [[0]] + b: [[4]] """ return self.__class__(lambda plx: self._call(plx).mean()) @@ -4054,9 +4063,11 @@ def mean(*columns: str) -> Expr: Examples: >>> import pandas as pd >>> import polars as pl + >>> import pyarrow as pa >>> import narwhals as nw >>> df_pl = pl.DataFrame({"a": [1, 8, 3]}) >>> df_pd = pd.DataFrame({"a": [1, 8, 3]}) + >>> df_pa = pa.table({"a": [1, 8, 3]}) We define a dataframe agnostic function: @@ -4064,7 +4075,7 @@ def mean(*columns: str) -> Expr: ... def func(df): ... return df.select(nw.mean("a")) - We can then pass either pandas or Polars to `func`: + We can pass any supported library such as Pandas, Polars, or PyArrow to `func`: >>> func(df_pd) a @@ -4078,6 +4089,11 @@ def mean(*columns: str) -> Expr: ╞═════╡ │ 4.0 │ └─────┘ + >>> func(df_pa) + pyarrow.Table + a: double + ---- + a: [[4]] """ return Expr(lambda plx: plx.mean(*columns)) diff --git a/narwhals/stable/v1/__init__.py b/narwhals/stable/v1/__init__.py index 75da2a42c..7bcd6146e 100644 --- a/narwhals/stable/v1/__init__.py +++ b/narwhals/stable/v1/__init__.py @@ -1254,9 +1254,11 @@ def mean(*columns: str) -> Expr: Examples: >>> import pandas as pd >>> import polars as pl + >>> import pyarrow as pa >>> import narwhals.stable.v1 as nw >>> df_pl = pl.DataFrame({"a": [1, 8, 3]}) >>> df_pd = pd.DataFrame({"a": [1, 8, 3]}) + >>> df_pa = pa.table({"a": [1, 8, 3]}) We define a dataframe agnostic function: @@ -1264,7 +1266,7 @@ def mean(*columns: str) -> Expr: ... def func(df): ... 
return df.select(nw.mean("a")) - We can then pass either pandas or Polars to `func`: + We can pass any supported library such as Pandas, Polars, or PyArrow to `func`: >>> func(df_pd) a @@ -1278,6 +1280,11 @@ def mean(*columns: str) -> Expr: ╞═════╡ │ 4.0 │ └─────┘ + >>> func(df_pa) + pyarrow.Table + a: double + ---- + a: [[4]] """ return _stableify(nw.mean(*columns)) From 741e66d34982739b019935d684881c61b1d48648 Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Thu, 17 Oct 2024 22:06:06 +0200 Subject: [PATCH 136/145] [pre-commit.ci] pre-commit autoupdate (#1178) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit updates: - [github.com/adamchainz/blacken-docs: 1.18.0 → 1.19.0](https://github.com/adamchainz/blacken-docs/compare/1.18.0...1.19.0) Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com> --- .pre-commit-config.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index 89d05e542..4d416e237 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -40,7 +40,7 @@ repos: hooks: - id: nbstripout - repo: https://github.com/adamchainz/blacken-docs - rev: "1.18.0" # replace with latest tag on GitHub + rev: "1.19.0" # replace with latest tag on GitHub hooks: - id: blacken-docs args: [--skip-errors] From 736a3606477a875e7f18f81d756efa809d98b519 Mon Sep 17 00:00:00 2001 From: Zhengbo Wang Date: Fri, 18 Oct 2024 04:07:18 +0800 Subject: [PATCH 137/145] docs: Add uv to quick start (#1169) * doc: Add uv to quick start * Update docs/installation.md Co-authored-by: Francesco Bruzzesi <42817048+FBruzzesi@users.noreply.github.com> * docs: update with content tab * docs: enhance installation instructions for UV and Python's venv * Update docs/installation.md Co-authored-by: Francesco Bruzzesi <42817048+FBruzzesi@users.noreply.github.com> --------- Co-authored-by: Francesco Bruzzesi <42817048+FBruzzesi@users.noreply.github.com> --- docs/installation.md | 30 +++++++++++++++++++++++------- 1 file changed, 23 insertions(+), 7 deletions(-) diff --git a/docs/installation.md b/docs/installation.md index 974eaf1f4..668126715 100644 --- a/docs/installation.md +++ b/docs/installation.md @@ -2,20 +2,36 @@ ## Installation -First, make sure you have [created and activated](https://docs.python.org/3/library/venv.html) a Python3.8+ virtual environment. +=== "UV" -Then, run -```console -python -m pip install narwhals -``` + First, ensure you have installed [UV](https://github.com/astral-sh/uv), and make sure you have [created and activated](https://docs.astral.sh/uv/pip/environments/#python-environments) a Python 3.8+ virtual environment. + + If you haven't, you can follow our [_setting up your environment_](https://github.com/narwhals-dev/narwhals/blob/main/CONTRIBUTING.md#option-1-use-uv-recommended) guide. + Then, run: + + ```console + uv pip install narwhals + ``` + +=== "Python's venv" + + First, ensure you have [created and activated](https://docs.python.org/3/library/venv.html) a Python 3.8+ virtual environment. + + Then, run: + + ```console + python -m pip install narwhals + ``` + +### Verifying the Installation -Then, if you start the Python REPL and see the following: +To verify the installation, start the Python REPL and execute: ```python >>> import narwhals >>> narwhals.__version__ '1.9.4' ``` -then installation worked correctly! 
+If you see the version number, then the installation was successful! ## Quick start From 05f9e061c2d78d5ad4cc69f1a5366bab6c3ac682 Mon Sep 17 00:00:00 2001 From: Liam Connors Date: Fri, 18 Oct 2024 02:57:12 -0400 Subject: [PATCH 138/145] docs: move index to start of API reference (#1213) --- mkdocs.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/mkdocs.yml b/mkdocs.yml index 75e2b0fd5..3793d898a 100644 --- a/mkdocs.yml +++ b/mkdocs.yml @@ -26,6 +26,7 @@ nav: - Supported Expr methods: api-completeness/expr.md - Supported Series methods: api-completeness/series.md - API Reference: + - api-reference/index.md - api-reference/narwhals.md - api-reference/dataframe.md - api-reference/expr.md @@ -42,7 +43,6 @@ nav: - api-reference/series_str.md - api-reference/dependencies.md - api-reference/dtypes.md - - api-reference/index.md - api-reference/selectors.md - api-reference/typing.md theme: From 90836bd7bc428d610cab8a65316aaedee3febe53 Mon Sep 17 00:00:00 2001 From: Myles Scolnick Date: Fri, 18 Oct 2024 03:30:00 -0400 Subject: [PATCH 139/145] simplify marimo in downstream_tests.yml (#1214) --- .github/workflows/downstream_tests.yml | 11 +++-------- 1 file changed, 3 insertions(+), 8 deletions(-) diff --git a/.github/workflows/downstream_tests.yml b/.github/workflows/downstream_tests.yml index 542dacbbf..4f9cbe06d 100644 --- a/.github/workflows/downstream_tests.yml +++ b/.github/workflows/downstream_tests.yml @@ -87,21 +87,16 @@ jobs: - name: show-deps run: uv pip freeze - name: Create assets directory, copy over index.html + continue-on-error: true run: | mkdir -p marimo/marimo/_static/assets cp marimo/frontend/index.html marimo/marimo/_static/index.html cp marimo/frontend/public/favicon.ico marimo/marimo/_static/favicon.ico - - name: Run tests with minimal dependencies - if: ${{ matrix.dependencies == 'core' }} - run: | - cd marimo - hatch run +py=${{ matrix.python-version }} test:test -v tests/ -k "not test_cli" - timeout-minutes: 15 - - name: Run tests with optional dependencies + - name: Run tests with full dependencies if: ${{ matrix.dependencies == 'core,optional' }} run: | cd marimo - hatch run +py=${{ matrix.python-version }} test-optional:test -v tests/ -k "not test_cli" + hatch run +py=${{ matrix.python-version }} test-optional:test-narwhals timeout-minutes: 15 - name: Run typechecks run: | From 8e5cc6686793ae86dea7aef755b07ca1765aafd9 Mon Sep 17 00:00:00 2001 From: raisadz <34237447+raisadz@users.noreply.github.com> Date: Fri, 18 Oct 2024 09:23:25 +0100 Subject: [PATCH 140/145] fix pyarrow to_date (#1216) --- narwhals/_arrow/series.py | 2 +- tests/expr_and_series/dt/datetime_attributes_test.py | 12 ++++++++++++ 2 files changed, 13 insertions(+), 1 deletion(-) diff --git a/narwhals/_arrow/series.py b/narwhals/_arrow/series.py index 65a393ca9..2eb738291 100644 --- a/narwhals/_arrow/series.py +++ b/narwhals/_arrow/series.py @@ -784,7 +784,7 @@ def date(self: Self) -> ArrowSeries: import pyarrow as pa # ignore-banned-import() return self._arrow_series._from_native_series( - self._arrow_series._native_series.cast(pa.date64()) + self._arrow_series._native_series.cast(pa.date32()) ) def year(self: Self) -> ArrowSeries: diff --git a/tests/expr_and_series/dt/datetime_attributes_test.py b/tests/expr_and_series/dt/datetime_attributes_test.py index 757d226ff..017daace6 100644 --- a/tests/expr_and_series/dt/datetime_attributes_test.py +++ b/tests/expr_and_series/dt/datetime_attributes_test.py @@ -104,3 +104,15 @@ def test_datetime_chained_attributes( result = 
df.select(nw.col("a").dt.date().dt.year()) compare_dicts(result, {"a": [2021, 2020]}) + + +def test_to_date(request: pytest.FixtureRequest, constructor: Constructor) -> None: + if any( + x in str(constructor) + for x in ("pandas_constructor", "pandas_nullable_constructor", "dask") + ): + request.applymarker(pytest.mark.xfail) + dates = {"a": [datetime(2001, 1, 1), None, datetime(2001, 1, 3)]} + df = nw.from_native(constructor(dates)) + result = df.select(nw.col("a").dt.date()) + assert result.collect_schema() == {"a": nw.Date} From 876303d8aa029d5f29b2e6091ef455d6370eaf12 Mon Sep 17 00:00:00 2001 From: Francesco Bruzzesi <42817048+FBruzzesi@users.noreply.github.com> Date: Fri, 18 Oct 2024 16:09:26 +0200 Subject: [PATCH 141/145] move maybe_evaluate_expr out of for loop (#1217) --- narwhals/_expression_parsing.py | 26 ++++++++++++++------------ 1 file changed, 14 insertions(+), 12 deletions(-) diff --git a/narwhals/_expression_parsing.py b/narwhals/_expression_parsing.py index d281cc945..89d020b0b 100644 --- a/narwhals/_expression_parsing.py +++ b/narwhals/_expression_parsing.py @@ -219,19 +219,21 @@ def reuse_series_implementation( plx = expr.__narwhals_namespace__() def func(df: CompliantDataFrame) -> list[CompliantSeries]: - out: list[CompliantSeries] = [] - for column in expr._call(df): # type: ignore[arg-type] - _out = getattr(column, attr)( - *[maybe_evaluate_expr(df, arg) for arg in args], - **{ - arg_name: maybe_evaluate_expr(df, arg_value) - for arg_name, arg_value in kwargs.items() - }, + _args = [maybe_evaluate_expr(df, arg) for arg in args] + _kwargs = { + arg_name: maybe_evaluate_expr(df, arg_value) + for arg_name, arg_value in kwargs.items() + } + + out: list[CompliantSeries] = [ + plx._create_series_from_scalar( + getattr(column, attr)(*_args, **_kwargs), + column, # type: ignore[arg-type] ) - if returns_scalar: - out.append(plx._create_series_from_scalar(_out, column)) # type: ignore[arg-type] - else: - out.append(_out) + if returns_scalar + else getattr(column, attr)(*_args, **_kwargs) + for column in expr._call(df) # type: ignore[arg-type] + ] if expr._output_names is not None and ( [s.name for s in out] != expr._output_names ): # pragma: no cover From feff1f170b62e77330a2d96cad3a581e35691e10 Mon Sep 17 00:00:00 2001 From: Francesco Bruzzesi <42817048+FBruzzesi@users.noreply.github.com> Date: Fri, 18 Oct 2024 16:17:21 +0200 Subject: [PATCH 142/145] chore: use constructors (#1210) --- tests/expr_and_series/str/contains_test.py | 5 ----- tests/series_only/cast_test.py | 19 +++++++++++-------- 2 files changed, 11 insertions(+), 13 deletions(-) diff --git a/tests/expr_and_series/str/contains_test.py b/tests/expr_and_series/str/contains_test.py index 2c2e0cb9f..98e8ceaa3 100644 --- a/tests/expr_and_series/str/contains_test.py +++ b/tests/expr_and_series/str/contains_test.py @@ -1,7 +1,5 @@ from __future__ import annotations -import pandas as pd -import polars as pl import pytest import narwhals.stable.v1 as nw @@ -11,9 +9,6 @@ data = {"pets": ["cat", "dog", "rabbit and parrot", "dove"]} -df_pandas = pd.DataFrame(data) -df_polars = pl.DataFrame(data) - def test_contains_case_insensitive( constructor: Constructor, request: pytest.FixtureRequest diff --git a/tests/series_only/cast_test.py b/tests/series_only/cast_test.py index 55752149b..c5b37b8c0 100644 --- a/tests/series_only/cast_test.py +++ b/tests/series_only/cast_test.py @@ -2,6 +2,7 @@ from datetime import date from datetime import datetime +from typing import TYPE_CHECKING import pandas as pd import polars as pl @@ -12,16 
+13,18 @@ import narwhals.stable.v1 as nw from narwhals.utils import parse_version +if TYPE_CHECKING: + from tests.utils import ConstructorEager -def test_cast_253() -> None: - df_polars = pl.DataFrame({"a": [1]}) - result = nw.from_native(df_polars, eager_only=True).select( - nw.col("a").cast(nw.String) + "hi" - )["a"][0] - assert result == "1hi" - df_pandas = pd.DataFrame({"a": [1]}) - result = nw.from_native(df_pandas, eager_only=True).select( +def test_cast_253( + request: pytest.FixtureRequest, constructor_eager: ConstructorEager +) -> None: + if "pyarrow_table" in str(constructor_eager): + request.applymarker(pytest.mark.xfail) + + df_raw = constructor_eager({"a": [1]}) + result = nw.from_native(df_raw, eager_only=True).select( nw.col("a").cast(nw.String) + "hi" )["a"][0] assert result == "1hi" From 73829c05e1917b02496b1f4e68a4f0de903bb31e Mon Sep 17 00:00:00 2001 From: artiom-matvei <46079515+artiom-matvei@users.noreply.github.com> Date: Fri, 18 Oct 2024 17:17:46 -0400 Subject: [PATCH 143/145] docs: update installation.md (#1221) --- docs/installation.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/installation.md b/docs/installation.md index 668126715..1695a7eec 100644 --- a/docs/installation.md +++ b/docs/installation.md @@ -85,4 +85,4 @@ If you run `python t.py` then your output should look like the above. This is th function - as we'll soon see, we can do much more advanced things. Let's learn about what you just did, and what Narwhals can do for you! -Note: these examples are only using pandas and Polars. Please see the following to find the [supported libraries](extending.md). +Note: these examples are only using pandas, Polars and PyArrow. Please see the following to find the [supported libraries](extending.md). 
From 59aa483931664ae579c54f1481ea59d4e550d5c5 Mon Sep 17 00:00:00 2001 From: Edoardo Abati <29585319+EdAbati@users.noreply.github.com> Date: Sat, 19 Oct 2024 09:34:47 +0200 Subject: [PATCH 144/145] chore: nox doctests only in Python 3.12 (#1222) * only doctest in 3.12 in nox * skip writing file in doctests --- narwhals/dataframe.py | 6 +++--- noxfile.py | 4 +++- 2 files changed, 6 insertions(+), 4 deletions(-) diff --git a/narwhals/dataframe.py b/narwhals/dataframe.py index ac814afc7..4645cfbb4 100644 --- a/narwhals/dataframe.py +++ b/narwhals/dataframe.py @@ -582,9 +582,9 @@ def write_parquet(self, file: str | Path | BytesIO) -> Any: We can then pass either pandas, Polars or PyArrow to `func`: - >>> func(df_pd) - >>> func(df_pl) - >>> func(df_pa) + >>> func(df_pd) # doctest:+SKIP + >>> func(df_pl) # doctest:+SKIP + >>> func(df_pa) # doctest:+SKIP """ self._compliant_frame.write_parquet(file) diff --git a/noxfile.py b/noxfile.py index 1fb820c65..aec70add4 100644 --- a/noxfile.py +++ b/noxfile.py @@ -27,7 +27,9 @@ def run_common(session: Session, coverage_threshold: float) -> None: f"--cov-fail-under={coverage_threshold}", "--runslow", ) - session.run("pytest", "narwhals", "--doctest-modules") + + if session.python == "3.12": + session.run("pytest", "narwhals", "--doctest-modules") @nox.session(python=PYTHON_VERSIONS) # type: ignore[misc] From 0c1650c07fa7d77602561d384c1cf6e8902fe75d Mon Sep 17 00:00:00 2001 From: Edoardo Abati <29585319+EdAbati@users.noreply.github.com> Date: Sat, 19 Oct 2024 09:42:47 +0200 Subject: [PATCH 145/145] feat: add to_py_scalar (#1194) * add to_py_scalar * fix tests * more fixes pragma and doctsting * fix test_to_py_scalar_cudf_series * convert numpy scalars * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * remove parse version * simplify test_to_py_scalar_arrays_series * add conversion for datetime and timedelta * stricter to_py_scalar --------- Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com> --- docs/api-reference/narwhals.md | 1 + narwhals/__init__.py | 2 + narwhals/dependencies.py | 5 +++ narwhals/stable/v1/__init__.py | 24 ++++++++++ narwhals/translate.py | 67 ++++++++++++++++++++++++++++ tests/translate/to_py_scalar_test.py | 63 ++++++++++++++++++++++++++ 6 files changed, 162 insertions(+) create mode 100644 tests/translate/to_py_scalar_test.py diff --git a/docs/api-reference/narwhals.md b/docs/api-reference/narwhals.md index b8ec2d793..c4b04a2f4 100644 --- a/docs/api-reference/narwhals.md +++ b/docs/api-reference/narwhals.md @@ -39,4 +39,5 @@ Here are the top-level functions available in Narwhals. 
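A note on the `pa.date32()` cast in the "fix pyarrow to_date" patch above: in PyArrow, `date32` stores dates as days since the UNIX epoch while `date64` stores them as milliseconds, so `date32` is the type that matches a day-precision `Date` dtype - presumably why the cast target was changed from `date64`. A minimal sketch of the difference (illustrative only, assuming pyarrow is installed; this is not part of any patch in this series):

from datetime import datetime
import pyarrow as pa

# Timestamps at midnight cast cleanly to either date type.
ts = pa.array([datetime(2021, 1, 1), datetime(2020, 12, 31)], type=pa.timestamp("us"))
days = ts.cast(pa.date32())    # day-precision dates (days since epoch)
millis = ts.cast(pa.date64())  # same dates, stored as milliseconds
print(days.type, millis.type)  # date32[day] date64[ms]
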
- when - show_versions - to_native + - to_py_scalar show_source: false diff --git a/narwhals/__init__.py b/narwhals/__init__.py index aeba3ef5e..8dd76d081 100644 --- a/narwhals/__init__.py +++ b/narwhals/__init__.py @@ -58,6 +58,7 @@ from narwhals.translate import get_native_namespace from narwhals.translate import narwhalify from narwhals.translate import to_native +from narwhals.translate import to_py_scalar from narwhals.utils import is_ordered_categorical from narwhals.utils import maybe_align_index from narwhals.utils import maybe_convert_dtypes @@ -84,6 +85,7 @@ "maybe_reset_index", "maybe_set_index", "get_native_namespace", + "to_py_scalar", "all", "all_horizontal", "any_horizontal", diff --git a/narwhals/dependencies.py b/narwhals/dependencies.py index 144c57c8a..1f9ae19f5 100644 --- a/narwhals/dependencies.py +++ b/narwhals/dependencies.py @@ -46,6 +46,11 @@ def get_cudf() -> Any: return sys.modules.get("cudf", None) +def get_cupy() -> Any: + """Get cupy module (if already imported - else return None).""" + return sys.modules.get("cupy", None) + + def get_pyarrow() -> Any: # pragma: no cover """Get pyarrow module (if already imported - else return None).""" return sys.modules.get("pyarrow", None) diff --git a/narwhals/stable/v1/__init__.py b/narwhals/stable/v1/__init__.py index 7bcd6146e..c09b0f2b3 100644 --- a/narwhals/stable/v1/__init__.py +++ b/narwhals/stable/v1/__init__.py @@ -51,6 +51,7 @@ from narwhals.translate import _from_native_impl from narwhals.translate import get_native_namespace as nw_get_native_namespace from narwhals.translate import to_native +from narwhals.translate import to_py_scalar as nw_to_py_scalar from narwhals.typing import IntoDataFrameT from narwhals.typing import IntoFrameT from narwhals.typing import IntoSeriesT @@ -952,6 +953,28 @@ def wrapper(*args: Any, **kwargs: Any) -> Any: return decorator(func) +def to_py_scalar(scalar: Any) -> Any: + """If a scalar is not Python native, converts it to Python native. + + Raises: + ValueError: If the object is not convertible to a scalar. + + Examples: + >>> import narwhals.stable.v1 as nw + >>> import pandas as pd + >>> df = nw.from_native(pd.DataFrame({"a": [1, 2, 3]})) + >>> nw.to_py_scalar(df["a"].item(0)) + 1 + >>> import pyarrow as pa + >>> df = nw.from_native(pa.table({"a": [1, 2, 3]})) + >>> nw.to_py_scalar(df["a"].item(0)) + 1 + >>> nw.to_py_scalar(1) + 1 + """ + return _stableify(nw_to_py_scalar(scalar)) + + def all() -> Expr: """ Instantiate an expression representing all columns. 
@@ -2306,6 +2329,7 @@ def from_dict( "dependencies", "to_native", "from_native", + "to_py_scalar", "is_ordered_categorical", "maybe_align_index", "maybe_convert_dtypes", diff --git a/narwhals/translate.py b/narwhals/translate.py index 0dc0cd467..331b87d88 100644 --- a/narwhals/translate.py +++ b/narwhals/translate.py @@ -1,5 +1,8 @@ from __future__ import annotations +import numbers +from datetime import datetime +from datetime import timedelta from functools import wraps from typing import TYPE_CHECKING from typing import Any @@ -9,9 +12,11 @@ from typing import overload from narwhals.dependencies import get_cudf +from narwhals.dependencies import get_cupy from narwhals.dependencies import get_dask from narwhals.dependencies import get_dask_expr from narwhals.dependencies import get_modin +from narwhals.dependencies import get_numpy from narwhals.dependencies import get_pandas from narwhals.dependencies import get_polars from narwhals.dependencies import get_pyarrow @@ -776,8 +781,70 @@ def wrapper(*args: Any, **kwargs: Any) -> Any: return decorator(func) +def to_py_scalar(scalar_like: Any) -> Any: + """If a scalar is not Python native, converts it to Python native. + + Raises: + ValueError: If the object is not convertible to a scalar. + + Examples: + >>> import narwhals as nw + >>> import pandas as pd + >>> df = nw.from_native(pd.DataFrame({"a": [1, 2, 3]})) + >>> nw.to_py_scalar(df["a"].item(0)) + 1 + >>> import pyarrow as pa + >>> df = nw.from_native(pa.table({"a": [1, 2, 3]})) + >>> nw.to_py_scalar(df["a"].item(0)) + 1 + >>> nw.to_py_scalar(1) + 1 + """ + + pa = get_pyarrow() + if pa and isinstance(scalar_like, pa.Scalar): + return scalar_like.as_py() + + cupy = get_cupy() + if ( # pragma: no cover + cupy and isinstance(scalar_like, cupy.ndarray) and scalar_like.size == 1 + ): + return scalar_like.item() + + np = get_numpy() + if np and np.isscalar(scalar_like) and hasattr(scalar_like, "item"): + return scalar_like.item() + + pd = get_pandas() + if pd and isinstance(scalar_like, pd.Timestamp): + return scalar_like.to_pydatetime() + if pd and isinstance(scalar_like, pd.Timedelta): + return scalar_like.to_pytimedelta() + + all_scalar_types = ( + int, + float, + complex, + bool, + bytes, + str, + datetime, + timedelta, + numbers.Number, + ) + if isinstance(scalar_like, all_scalar_types): + return scalar_like + + msg = ( + f"Expected object convertible to a scalar, found {type(scalar_like)}. 
" + "Please report a bug to https://github.com/narwhals-dev/narwhals/issues" + ) + raise ValueError(msg) + + __all__ = [ "get_native_namespace", "to_native", "narwhalify", + "to_py_scalar", ] diff --git a/tests/translate/to_py_scalar_test.py b/tests/translate/to_py_scalar_test.py new file mode 100644 index 000000000..c9aa2749d --- /dev/null +++ b/tests/translate/to_py_scalar_test.py @@ -0,0 +1,63 @@ +from __future__ import annotations + +from datetime import datetime +from datetime import timedelta +from typing import TYPE_CHECKING +from typing import Any + +import numpy as np +import pandas as pd +import pytest + +import narwhals.stable.v1 as nw +from narwhals.dependencies import get_cudf + +if TYPE_CHECKING: + from tests.utils import ConstructorEager + + +@pytest.mark.parametrize( + ("input_value", "expected"), + [ + (1, 1), + (1.0, 1.0), + ("a", "a"), + (True, True), + (b"a", b"a"), + (datetime(2021, 1, 1), datetime(2021, 1, 1)), + (timedelta(days=1), timedelta(days=1)), + ], +) +def test_to_py_scalar( + constructor_eager: ConstructorEager, input_value: Any, expected: Any +) -> None: + df = nw.from_native(constructor_eager({"a": [input_value]})) + output = nw.to_py_scalar(df["a"].item(0)) + if expected == 1 and constructor_eager.__name__.startswith("pandas"): + assert not isinstance(output, np.int64) + elif isinstance(expected, datetime) and constructor_eager.__name__.startswith( + "pandas" + ): + assert not isinstance(output, pd.Timestamp) + elif isinstance(expected, timedelta) and constructor_eager.__name__.startswith( + "pandas" + ): + assert not isinstance(output, pd.Timedelta) + assert output == expected + + +@pytest.mark.parametrize( + "input_value", + [np.array([1, 2]), [1, 2, 3], {"a": [1, 2, 3]}], +) +def test_to_py_scalar_value_error(input_value: Any) -> None: + with pytest.raises(ValueError, match="Expected object convertible to a scalar"): + nw.to_py_scalar(input_value) + + +def test_to_py_scalar_value_error_cudf() -> None: + if cudf := get_cudf(): # pragma: no cover + df = nw.from_native(cudf.DataFrame({"a": [1, 2, 3]})) + + with pytest.raises(ValueError, match="Expected object convertible to a scalar"): + nw.to_py_scalar(df["a"])