
Commit

Merge branch 'main' into feat/pyarrow-to-datetime-infer
FBruzzesi authored Oct 17, 2024
2 parents 3a26b96 + e980483 commit bc5f854
Showing 129 changed files with 885 additions and 430 deletions.
2 changes: 1 addition & 1 deletion .github/workflows/bump-version.yml
@@ -33,7 +33,7 @@ jobs:
python utils/bump_version.py ${{ github.event.inputs.release_type }}
- name: Create pull request
uses: actions/github-script@v6
uses: actions/github-script@v7
if: github.actor == 'MarcoGorelli' || github.actor == 'FBruzzesi'
with:
script: |
6 changes: 1 addition & 5 deletions .github/workflows/extremes.yml
@@ -59,8 +59,6 @@ jobs:
run: uv pip freeze
- name: Run pytest
run: pytest tests --cov=narwhals --cov=tests --cov-fail-under=50 --runslow
- name: Run doctests
run: pytest narwhals --doctest-modules

not_so_old_versions:
strategy:
@@ -88,13 +86,11 @@ jobs:
run: uv pip freeze
- name: Run pytest
run: pytest tests --cov=narwhals --cov=tests --cov-fail-under=50 --runslow
- name: Run doctests
run: pytest narwhals --doctest-modules

nightlies:
strategy:
matrix:
python-version: ["3.11"]
python-version: ["3.12"]
os: [ubuntu-latest]
if: github.event.pull_request.head.repo.full_name == github.repository
runs-on: ${{ matrix.os }}
6 changes: 1 addition & 5 deletions .github/workflows/pytest.yml
@@ -30,9 +30,6 @@ jobs:
run: uv pip freeze
- name: Run pytest
run: pytest tests --cov=narwhals --cov=tests --cov-fail-under=85
- name: Run doctests
if: startsWith(matrix.os, 'windows') != true
run: pytest narwhals --doctest-modules

pytest-windows:
strategy:
@@ -60,8 +57,6 @@ jobs:
run: uv pip freeze
- name: Run pytest
run: pytest tests --cov=narwhals --cov=tests --runslow --cov-fail-under=95
- name: Run doctests
run: pytest narwhals --doctest-modules

pytest-coverage:
strategy:
@@ -95,4 +90,5 @@ jobs:
- name: Run pytest
run: pytest tests --cov=narwhals --cov=tests --cov-fail-under=100 --runslow
- name: Run doctests
if: matrix.python-version == '3.12'
run: pytest narwhals --doctest-modules
13 changes: 8 additions & 5 deletions CONTRIBUTING.md
@@ -51,17 +51,20 @@ Here's how you can set up your local development environment to contribute.

#### Option 1: Use UV (recommended)

1. Make sure you have Python3.8+ installed (for example, Python 3.11), create a virtual environment,
1. Make sure you have Python3.12 installed, create a virtual environment,
and activate it. If you're new to this, here's one way that we recommend:
1. Install uv: https://github.com/astral-sh/uv?tab=readme-ov-file#getting-started
2. Install some version of Python greater than Python3.8. For example, to install
Python3.11:
or make sure it is up-to-date with:
```
uv python install 3.11
uv self update
```
2. Install Python3.12:
```
uv python install 3.12
```
3. Create a virtual environment:
```
uv venv -p 3.11 --seed
uv venv -p 3.12 --seed
```
4. Activate it. On Linux, this is `. .venv/bin/activate`, on Windows `.\.venv\Scripts\activate`.
2. Install Narwhals: `uv pip install -e .`
3 changes: 2 additions & 1 deletion docs/api-reference/dtypes.md
@@ -6,7 +6,6 @@
members:
- Array
- List
- Struct
- Int64
- Int32
- Int16
@@ -15,12 +14,14 @@
- UInt32
- UInt16
- UInt8
- Field
- Float64
- Float32
- Boolean
- Categorical
- Enum
- String
- Struct
- Date
- Datetime
- Duration
1 change: 1 addition & 0 deletions docs/api-reference/narwhals.md
@@ -14,6 +14,7 @@ Here are the top-level functions available in Narwhals.
- concat_str
- from_dict
- from_native
- from_arrow
- get_level
- get_native_namespace
- is_ordered_categorical
4 changes: 4 additions & 0 deletions narwhals/__init__.py
@@ -10,6 +10,7 @@
from narwhals.dtypes import Datetime
from narwhals.dtypes import Duration
from narwhals.dtypes import Enum
from narwhals.dtypes import Field
from narwhals.dtypes import Float32
from narwhals.dtypes import Float64
from narwhals.dtypes import Int8
@@ -44,6 +45,7 @@
from narwhals.expr import sum_horizontal
from narwhals.expr import when
from narwhals.functions import concat
from narwhals.functions import from_arrow
from narwhals.functions import from_dict
from narwhals.functions import get_level
from narwhals.functions import new_series
@@ -68,6 +70,7 @@
"selectors",
"concat",
"from_dict",
"from_arrow",
"get_level",
"new_series",
"to_native",
@@ -118,6 +121,7 @@
"String",
"Datetime",
"Duration",
"Field",
"Struct",
"Array",
"List",
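
For illustration only (not part of the diff): with `Field` now exported at the top level, a `Struct` dtype can be built from named fields directly, using the same constructors the backend conversions below rely on.

```python
import narwhals as nw

# A Struct dtype that carries its fields explicitly, built from
# Field(name, dtype) pairs as in the backend conversions in this commit.
dtype = nw.Struct([nw.Field("id", nw.Int64()), nw.Field("name", nw.String())])
print(dtype)  # exact repr may vary between versions
```
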
11 changes: 10 additions & 1 deletion narwhals/_arrow/utils.py
@@ -56,7 +56,16 @@ def native_to_narwhals_dtype(dtype: Any, dtypes: DTypes) -> DType:
if pa.types.is_dictionary(dtype):
return dtypes.Categorical()
if pa.types.is_struct(dtype):
return dtypes.Struct()
return dtypes.Struct(
[
dtypes.Field(
dtype.field(i).name,
native_to_narwhals_dtype(dtype.field(i).type, dtypes),
)
for i in range(dtype.num_fields)
]
)

if pa.types.is_list(dtype) or pa.types.is_large_list(dtype):
return dtypes.List(native_to_narwhals_dtype(dtype.value_type, dtypes))
if pa.types.is_fixed_size_list(dtype):
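
As a rough public-API sketch of what this enables (not part of the diff; the exact repr may differ): a PyArrow struct column should now surface its field names and types instead of a bare `Struct()`.

```python
import pyarrow as pa
import narwhals as nw

# A pyarrow table whose "user" column has type struct<id: int64, name: string>.
tbl = pa.table({"user": [{"id": 1, "name": "a"}, {"id": 2, "name": "b"}]})
df = nw.from_native(tbl, eager_only=True)

# With this change, the struct's fields are preserved in the narwhals dtype.
print(df.schema["user"])
```
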
11 changes: 10 additions & 1 deletion narwhals/_duckdb/dataframe.py
@@ -52,7 +52,16 @@ def map_duckdb_dtype_to_narwhals_dtype(duckdb_dtype: Any, dtypes: DTypes) -> DTy
if duckdb_dtype == "INTERVAL":
return dtypes.Duration()
if duckdb_dtype.startswith("STRUCT"):
return dtypes.Struct()
matchstruc_ = re.findall(r"(\w+)\s+(\w+)", duckdb_dtype)
return dtypes.Struct(
[
dtypes.Field(
matchstruc_[i][0],
map_duckdb_dtype_to_narwhals_dtype(matchstruc_[i][1], dtypes),
)
for i in range(len(matchstruc_))
]
)
if match_ := re.match(r"(.*)\[\]$", duckdb_dtype):
return dtypes.List(map_duckdb_dtype_to_narwhals_dtype(match_.group(1), dtypes))
if match_ := re.match(r"(\w+)\[(\d+)\]", duckdb_dtype):
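
The DuckDB branch pulls `name TYPE` pairs out of the `STRUCT(...)` type string with a regex; a small standalone check of that pattern (the type string below is a hypothetical example):

```python
import re

duckdb_dtype = "STRUCT(a BIGINT, b VARCHAR)"  # hypothetical DuckDB type string
pairs = re.findall(r"(\w+)\s+(\w+)", duckdb_dtype)
print(pairs)  # [('a', 'BIGINT'), ('b', 'VARCHAR')]
```
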
10 changes: 9 additions & 1 deletion narwhals/_ibis/dataframe.py
@@ -51,7 +51,15 @@ def map_ibis_dtype_to_narwhals_dtype(ibis_dtype: Any, dtypes: DTypes) -> DType:
map_ibis_dtype_to_narwhals_dtype(ibis_dtype.value_type, dtypes)
)
if ibis_dtype.is_struct():
return dtypes.Struct()
return dtypes.Struct(
[
dtypes.Field(
ibis_dtype_name,
map_ibis_dtype_to_narwhals_dtype(ibis_dtype_field, dtypes),
)
for ibis_dtype_name, ibis_dtype_field in ibis_dtype.items()
]
)
return dtypes.Unknown() # pragma: no cover


4 changes: 1 addition & 3 deletions narwhals/_pandas_like/series.py
@@ -619,9 +619,7 @@ def quantile(

def zip_with(self: Self, mask: Any, other: Any) -> PandasLikeSeries:
ser = self._native_series
mask = validate_column_comparand(
ser.index, mask, treat_length_one_as_scalar=False
)
mask = validate_column_comparand(ser.index, mask)
other = validate_column_comparand(ser.index, other)
res = ser.where(mask, other)
return self._from_native_series(res)
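
For context (not part of the diff): `zip_with` ultimately defers to pandas' `Series.where`, which keeps values from the series where the mask is true and takes them from `other` elsewhere.

```python
import pandas as pd

ser = pd.Series([1, 2, 3])
mask = pd.Series([True, False, True])
other = pd.Series([10, 20, 30])

# Series.where keeps `ser` where mask is True and falls back to `other`.
print(ser.where(mask, other).tolist())  # [1, 20, 3]
```
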
14 changes: 7 additions & 7 deletions narwhals/_pandas_like/utils.py
@@ -32,9 +32,7 @@
}


def validate_column_comparand(
index: Any, other: Any, *, treat_length_one_as_scalar: bool = True
) -> Any:
def validate_column_comparand(index: Any, other: Any) -> Any:
"""Validate RHS of binary operation.
If the comparison isn't supported, return `NotImplemented` so that the
@@ -55,9 +53,10 @@ def validate_column_comparand(
if isinstance(other, PandasLikeDataFrame):
return NotImplemented
if isinstance(other, PandasLikeSeries):
if other.len() == 1 and treat_length_one_as_scalar:
if other.len() == 1:
# broadcast
return other.item()
s = other._native_series
return s.__class__(s.iloc[0], index=index, dtype=s.dtype)
if other._native_series.index is not index:
return set_axis(
other._native_series,
@@ -83,7 +82,8 @@ def validate_dataframe_comparand(index: Any, other: Any) -> Any:
if isinstance(other, PandasLikeSeries):
if other.len() == 1:
# broadcast
return other._native_series.iloc[0]
s = other._native_series
return s.__class__(s.iloc[0], index=index, dtype=s.dtype)
if other._native_series.index is not index:
return set_axis(
other._native_series,
@@ -294,7 +294,7 @@ def native_to_narwhals_dtype(native_column: Any, dtypes: DTypes) -> DType:
native_column.dtype.pyarrow_dtype.list_size,
)
if dtype.startswith("struct"):
return dtypes.Struct()
return arrow_native_to_narwhals_dtype(native_column.dtype.pyarrow_dtype, dtypes)
if dtype == "object":
if ( # pragma: no cover TODO(unassigned): why does this show as uncovered?
idx := getattr(native_column, "first_valid_index", lambda: None)()
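
The new broadcasting path builds a full-length series from the single value rather than returning a bare scalar, so the result keeps the left-hand index and dtype. A minimal pandas sketch of that constructor call (variable names are illustrative):

```python
import pandas as pd

index = pd.RangeIndex(3)           # index of the left-hand series
s = pd.Series([5], dtype="int64")  # length-1 comparand to broadcast

# Same shape as `s.__class__(s.iloc[0], index=index, dtype=s.dtype)` above:
broadcast = type(s)(s.iloc[0], index=index, dtype=s.dtype)
print(broadcast.tolist())  # [5, 5, 5]
```
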
7 changes: 6 additions & 1 deletion narwhals/_polars/utils.py
@@ -75,7 +75,12 @@ def native_to_narwhals_dtype(dtype: Any, dtypes: DTypes) -> DType:
du_time_unit: Literal["us", "ns", "ms"] = getattr(dtype, "time_unit", "us")
return dtypes.Duration(time_unit=du_time_unit)
if dtype == pl.Struct:
return dtypes.Struct()
return dtypes.Struct(
[
dtypes.Field(field_name, native_to_narwhals_dtype(field_type, dtypes))
for field_name, field_type in dtype
]
)
if dtype == pl.List:
return dtypes.List(native_to_narwhals_dtype(dtype.inner, dtypes))
if dtype == pl.Array:
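
The Polars branch iterates the `pl.Struct` dtype directly, relying on it yielding `(field_name, field_dtype)` pairs; a brief sketch, assuming a Polars version where that iteration behaviour holds:

```python
import polars as pl

dtype = pl.Struct({"a": pl.Int64, "b": pl.String})

# Iterating a Struct dtype is assumed to yield (name, dtype) pairs,
# which the conversion above maps to narwhals Field objects.
print(list(dtype))
```
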
42 changes: 21 additions & 21 deletions narwhals/dataframe.py
@@ -546,12 +546,12 @@ def write_csv(self, file: str | Path | BytesIO | None = None) -> Any:
We can pass any supported library such as pandas, Polars or PyArrow to `func`:
>>> func(df_pd) # doctest: +SKIP
'foo,bar,ham\n1,6.0,a\n2,7.0,b\n3,8.0,c\n'
>>> func(df_pl) # doctest: +SKIP
>>> func(df_pd)
'foo,bar,ham\n1,6.0,a\n2,7.0,b\n3,8.0,c\n'
>>> func(df_pa) # doctest: +SKIP
>>> func(df_pl)
'foo,bar,ham\n1,6.0,a\n2,7.0,b\n3,8.0,c\n'
>>> func(df_pa)
'"foo","bar","ham"\n1,6,"a"\n2,7,"b"\n3,8,"c"\n'
If we had passed a file name to `write_csv`, it would have been
written to that file.
@@ -582,9 +582,9 @@ def write_parquet(self, file: str | Path | BytesIO) -> Any:
We can then pass either pandas, Polars or PyArrow to `func`:
>>> func(df_pd) # doctest:+SKIP
>>> func(df_pl) # doctest:+SKIP
>>> func(df_pa) # doctest:+SKIP
>>> func(df_pd)
>>> func(df_pl)
>>> func(df_pa)
"""
self._compliant_frame.write_parquet(file)

@@ -1116,12 +1116,12 @@ def schema(self) -> Schema:
You can pass either pandas or Polars to `func`:
>>> df_pd_schema = func(df_pd)
>>> df_pd_schema # doctest:+SKIP
Schema({'foo': Int64, 'bar': Float64, 'ham', String})
>>> df_pd_schema
Schema({'foo': Int64, 'bar': Float64, 'ham': String})
>>> df_pl_schema = func(df_pl)
>>> df_pl_schema # doctest:+SKIP
Schema({'foo': Int64, 'bar': Float64, 'ham', String})
>>> df_pl_schema
Schema({'foo': Int64, 'bar': Float64, 'ham': String})
"""
return super().schema

@@ -1150,12 +1150,12 @@ def collect_schema(self: Self) -> Schema:
You can pass either pandas or Polars to `func`:
>>> df_pd_schema = func(df_pd)
>>> df_pd_schema # doctest:+SKIP
Schema({'foo': Int64, 'bar': Float64, 'ham', String})
>>> df_pd_schema
Schema({'foo': Int64, 'bar': Float64, 'ham': String})
>>> df_pl_schema = func(df_pl)
>>> df_pl_schema # doctest:+SKIP
Schema({'foo': Int64, 'bar': Float64, 'ham', String})
>>> df_pl_schema
Schema({'foo': Int64, 'bar': Float64, 'ham': String})
"""
return super().collect_schema()

@@ -2478,8 +2478,8 @@ def item(self: Self, row: int | None = None, column: int | str | None = None) ->
We can then pass either pandas or Polars to `func`:
>>> func(df_pd, 1, 1), func(df_pd, 2, "b") # doctest:+SKIP
(5, 6)
>>> func(df_pd, 1, 1), func(df_pd, 2, "b")
(np.int64(5), np.int64(6))
>>> func(df_pl, 1, 1), func(df_pl, 2, "b")
(5, 6)
@@ -2581,7 +2581,7 @@ def to_arrow(self: Self) -> pa.Table:
... def func(df):
... return df.to_arrow()
>>> func(df_pd) # doctest:+SKIP
>>> func(df_pd)
pyarrow.Table
foo: int64
bar: string
@@ -3010,7 +3010,7 @@ def schema(self) -> Schema:
... }
... )
>>> lf = nw.from_native(lf_pl)
>>> lf.schema # doctest:+SKIP
>>> lf.schema # doctest: +SKIP
Schema({'foo': Int64, 'bar': Float64, 'ham', String})
"""
return super().schema
@@ -3030,8 +3030,8 @@ def collect_schema(self: Self) -> Schema:
... }
... )
>>> lf = nw.from_native(lf_pl)
>>> lf.collect_schema() # doctest:+SKIP
Schema({'foo': Int64, 'bar': Float64, 'ham', String})
>>> lf.collect_schema()
Schema({'foo': Int64, 'bar': Float64, 'ham': String})
"""
return super().collect_schema()
