From 876353581cec0dceecb025a3a7d9d99496338b31 Mon Sep 17 00:00:00 2001 From: Marco Edward Gorelli Date: Sat, 2 Nov 2024 13:28:24 +0000 Subject: [PATCH] fix: fixed-offset datetime weren't being parsed (#1303) * fix: fix parsing of fixed-offset timezones * versions --- narwhals/_pandas_like/utils.py | 61 +++++++++++++++++++++++++++------- tests/dtypes_test.py | 21 ++++++++++++ 2 files changed, 70 insertions(+), 12 deletions(-) diff --git a/narwhals/_pandas_like/utils.py b/narwhals/_pandas_like/utils.py index 99181bc1e..8074413d7 100644 --- a/narwhals/_pandas_like/utils.py +++ b/narwhals/_pandas_like/utils.py @@ -30,6 +30,51 @@ Implementation.CUDF, Implementation.MODIN, } +PD_DATETIME_RGX = r"""^ + datetime64\[ + (?P<time_unit>s|ms|us|ns) # Match time unit: s, ms, us, or ns + (?:, # Begin non-capturing group for optional timezone + \s* # Optional whitespace after comma + (?P<time_zone> # Start named group for timezone + [a-zA-Z\/]+ # Match timezone name, e.g., UTC, America/New_York + (?:[+-]\d{2}:\d{2})? # Optional offset in format +HH:MM or -HH:MM + | # OR + pytz\.FixedOffset\(\d+\) # Match pytz.FixedOffset with integer offset in parentheses + ) # End time_zone group + )? # End optional timezone group + \] # Closing bracket for datetime64 +$""" +PATTERN_PD_DATETIME = re.compile(PD_DATETIME_RGX, re.VERBOSE) +PA_DATETIME_RGX = r"""^ + timestamp\[ + (?P<time_unit>s|ms|us|ns) # Match time unit: s, ms, us, or ns + (?:, # Begin non-capturing group for optional timezone + \s?tz= # Match "tz=" prefix + (?P<time_zone> # Start named group for timezone + [a-zA-Z\/]* # Match timezone name (e.g., UTC, America/New_York) + (?: # Begin optional non-capturing group for offset + [+-]\d{2}:\d{2} # Match offset in format +HH:MM or -HH:MM + )? # End optional offset group + ) # End time_zone group + )? 
# End optional timezone group + \] # Closing bracket for timestamp + \[pyarrow\] # Literal string "[pyarrow]" +$""" +PATTERN_PA_DATETIME = re.compile(PA_DATETIME_RGX, re.VERBOSE) +PD_DURATION_RGX = r"""^ + timedelta64\[ + (?P<time_unit>s|ms|us|ns) # Match time unit: s, ms, us, or ns + \] # Closing bracket for timedelta64 +$""" + +PATTERN_PD_DURATION = re.compile(PD_DURATION_RGX, re.VERBOSE) +PA_DURATION_RGX = r"""^ + duration\[ + (?P<time_unit>s|ms|us|ns) # Match time unit: s, ms, us, or ns + \] # Closing bracket for duration + \[pyarrow\] # Literal string "[pyarrow]" +$""" +PATTERN_PA_DURATION = re.compile(PA_DURATION_RGX, re.VERBOSE) def validate_column_comparand(index: Any, other: Any) -> Any: @@ -223,14 +268,6 @@ def native_to_narwhals_dtype( ) -> DType: dtype = str(native_column.dtype) - pd_datetime_rgx = ( - r"^datetime64\[(?P<time_unit>s|ms|us|ns)(?:, (?P<time_zone>[a-zA-Z\/]+))?\]$" - ) - pa_datetime_rgx = r"^timestamp\[(?P<time_unit>s|ms|us|ns)(?:, tz=(?P<time_zone>[a-zA-Z\/]+))?\]\[pyarrow\]$" - - pd_duration_rgx = r"^timedelta64\[(?P<time_unit>s|ms|us|ns)\]$" - pa_duration_rgx = r"^duration\[(?P<time_unit>s|ms|us|ns)\]\[pyarrow\]$" - if dtype in {"int64", "Int64", "Int64[pyarrow]", "int64[pyarrow]"}: return dtypes.Int64() if dtype in {"int32", "Int32", "Int32[pyarrow]", "int32[pyarrow]"}: @@ -269,14 +306,14 @@ def native_to_narwhals_dtype( return dtypes.Boolean() if dtype == "category" or dtype.startswith("dictionary<"): return dtypes.Categorical() - if (match_ := re.match(pd_datetime_rgx, dtype)) or ( - match_ := re.match(pa_datetime_rgx, dtype) + if (match_ := PATTERN_PD_DATETIME.match(dtype)) or ( + match_ := PATTERN_PA_DATETIME.match(dtype) ): dt_time_unit: Literal["us", "ns", "ms", "s"] = match_.group("time_unit") # type: ignore[assignment] dt_time_zone: str | None = match_.group("time_zone") return dtypes.Datetime(dt_time_unit, dt_time_zone) - if (match_ := re.match(pd_duration_rgx, dtype)) or ( - match_ := re.match(pa_duration_rgx, dtype) + if (match_ := PATTERN_PD_DURATION.match(dtype)) or ( + match_ := 
PATTERN_PA_DURATION.match(dtype) ): du_time_unit: Literal["us", "ns", "ms", "s"] = match_.group("time_unit") # type: ignore[assignment] return dtypes.Duration(du_time_unit) diff --git a/tests/dtypes_test.py b/tests/dtypes_test.py index 0d6363aee..2993521b9 100644 --- a/tests/dtypes_test.py +++ b/tests/dtypes_test.py @@ -176,3 +176,24 @@ def test_pandas_inplace_modification_1267(request: pytest.FixtureRequest) -> Non assert snw.dtype == nw.Int64 s[0] = 999.5 assert snw.dtype == nw.Float64 + + +def test_pandas_fixed_offset_1302() -> None: + result = nw.from_native( + pd.Series(pd.to_datetime(["2020-01-01T00:00:00.000000000+01:00"])), + series_only=True, + ).dtype + if PANDAS_VERSION >= (2,): + assert result == nw.Datetime("ns", "UTC+01:00") + else: # pragma: no cover + assert result == nw.Datetime("ns", "pytz.FixedOffset(60)") + if PANDAS_VERSION >= (2,): + result = nw.from_native( + pd.Series( + pd.to_datetime(["2020-01-01T00:00:00.000000000+01:00"]) + ).convert_dtypes(dtype_backend="pyarrow"), + series_only=True, + ).dtype + assert result == nw.Datetime("ns", "+01:00") + else: # pragma: no cover + pass