Skip to content

Commit

Permalink
fix: fixed-offset datetimes weren't being parsed (#1303)
Browse files Browse the repository at this point in the history
* fix: fix parsing of fixed-offset timezones

* versions
  • Loading branch information
MarcoGorelli authored Nov 2, 2024
1 parent 0ec5e00 commit 8763535
Show file tree
Hide file tree
Showing 2 changed files with 70 additions and 12 deletions.
61 changes: 49 additions & 12 deletions narwhals/_pandas_like/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -30,6 +30,51 @@
Implementation.CUDF,
Implementation.MODIN,
}
# Regular expressions used to recognise datetime/duration dtype strings.
# They are compiled once at import time (with re.VERBOSE, so whitespace and
# `#` comments inside the patterns are ignored) instead of on every call.
#
# Timezone names may contain letters, digits, "_", "/", "+" and "-" so that
# names such as "America/New_York", "Etc/GMT+5" and "America/Port-au-Prince"
# are accepted, not only purely alphabetic ones such as "UTC".

# Datetime dtype strings of the form "datetime64[<unit>]" or
# "datetime64[<unit>, <time zone>]".
PD_DATETIME_RGX = r"""^
    datetime64\[
        (?P<time_unit>s|ms|us|ns)           # Match time unit: s, ms, us, or ns
        (?:,                                # Begin non-capturing group for optional timezone
            \s*                             # Optional whitespace after comma
            (?P<time_zone>                  # Start named group for timezone
                [a-zA-Z0-9_\/+\-]+          # Match timezone name, e.g., UTC, America/New_York, Etc/GMT+5
                (?:[+-]\d{2}:\d{2})?        # Optional offset in format +HH:MM or -HH:MM
                |                           # OR
                pytz\.FixedOffset\(-?\d+\)  # Match pytz.FixedOffset with a (possibly negative) integer offset
            )                               # End time_zone group
        )?                                  # End optional timezone group
    \]                                      # Closing bracket for datetime64
$"""
PATTERN_PD_DATETIME = re.compile(PD_DATETIME_RGX, re.VERBOSE)

# Datetime dtype strings of the form "timestamp[<unit>][pyarrow]" or
# "timestamp[<unit>, tz=<time zone>][pyarrow]".
PA_DATETIME_RGX = r"""^
    timestamp\[
        (?P<time_unit>s|ms|us|ns)           # Match time unit: s, ms, us, or ns
        (?:,                                # Begin non-capturing group for optional timezone
            \s?tz=                          # Match "tz=" prefix
            (?P<time_zone>                  # Start named group for timezone
                [a-zA-Z0-9_\/+\-]*          # Match timezone name (e.g., UTC, America/New_York); may be empty
                (?:                         # Begin optional non-capturing group for offset
                    [+-]\d{2}:\d{2}         # Match offset in format +HH:MM or -HH:MM
                )?                          # End optional offset group
            )                               # End time_zone group
        )?                                  # End optional timezone group
    \]                                      # Closing bracket for timestamp
    \[pyarrow\]                             # Literal string "[pyarrow]"
$"""
PATTERN_PA_DATETIME = re.compile(PA_DATETIME_RGX, re.VERBOSE)

# Duration dtype strings of the form "timedelta64[<unit>]".
PD_DURATION_RGX = r"""^
    timedelta64\[
        (?P<time_unit>s|ms|us|ns)           # Match time unit: s, ms, us, or ns
    \]                                      # Closing bracket for timedelta64
$"""
PATTERN_PD_DURATION = re.compile(PD_DURATION_RGX, re.VERBOSE)

# Duration dtype strings of the form "duration[<unit>][pyarrow]".
PA_DURATION_RGX = r"""^
    duration\[
        (?P<time_unit>s|ms|us|ns)           # Match time unit: s, ms, us, or ns
    \]                                      # Closing bracket for duration
    \[pyarrow\]                             # Literal string "[pyarrow]"
$"""
PATTERN_PA_DURATION = re.compile(PA_DURATION_RGX, re.VERBOSE)


def validate_column_comparand(index: Any, other: Any) -> Any:
Expand Down Expand Up @@ -223,14 +268,6 @@ def native_to_narwhals_dtype(
) -> DType:
dtype = str(native_column.dtype)

pd_datetime_rgx = (
r"^datetime64\[(?P<time_unit>s|ms|us|ns)(?:, (?P<time_zone>[a-zA-Z\/]+))?\]$"
)
pa_datetime_rgx = r"^timestamp\[(?P<time_unit>s|ms|us|ns)(?:, tz=(?P<time_zone>[a-zA-Z\/]+))?\]\[pyarrow\]$"

pd_duration_rgx = r"^timedelta64\[(?P<time_unit>s|ms|us|ns)\]$"
pa_duration_rgx = r"^duration\[(?P<time_unit>s|ms|us|ns)\]\[pyarrow\]$"

if dtype in {"int64", "Int64", "Int64[pyarrow]", "int64[pyarrow]"}:
return dtypes.Int64()
if dtype in {"int32", "Int32", "Int32[pyarrow]", "int32[pyarrow]"}:
Expand Down Expand Up @@ -269,14 +306,14 @@ def native_to_narwhals_dtype(
return dtypes.Boolean()
if dtype == "category" or dtype.startswith("dictionary<"):
return dtypes.Categorical()
if (match_ := re.match(pd_datetime_rgx, dtype)) or (
match_ := re.match(pa_datetime_rgx, dtype)
if (match_ := PATTERN_PD_DATETIME.match(dtype)) or (
match_ := PATTERN_PA_DATETIME.match(dtype)
):
dt_time_unit: Literal["us", "ns", "ms", "s"] = match_.group("time_unit") # type: ignore[assignment]
dt_time_zone: str | None = match_.group("time_zone")
return dtypes.Datetime(dt_time_unit, dt_time_zone)
if (match_ := re.match(pd_duration_rgx, dtype)) or (
match_ := re.match(pa_duration_rgx, dtype)
if (match_ := PATTERN_PD_DURATION.match(dtype)) or (
match_ := PATTERN_PA_DURATION.match(dtype)
):
du_time_unit: Literal["us", "ns", "ms", "s"] = match_.group("time_unit") # type: ignore[assignment]
return dtypes.Duration(du_time_unit)
Expand Down
21 changes: 21 additions & 0 deletions tests/dtypes_test.py
Original file line number Diff line number Diff line change
Expand Up @@ -176,3 +176,24 @@ def test_pandas_inplace_modification_1267(request: pytest.FixtureRequest) -> Non
assert snw.dtype == nw.Int64
s[0] = 999.5
assert snw.dtype == nw.Float64


def test_pandas_fixed_offset_1302() -> None:
    """Check that a fixed-offset (+01:00) datetime Series maps to ``nw.Datetime``.

    The expected time zone string depends on the installed pandas version:
    "UTC+01:00" for pandas >= 2, "pytz.FixedOffset(60)" otherwise. The
    pyarrow-backed variant is only checked for pandas >= 2.
    """
    timestamps = ["2020-01-01T00:00:00.000000000+01:00"]
    is_pandas_2_plus = PANDAS_VERSION >= (2,)

    # Default pandas dtype.
    native_series = pd.Series(pd.to_datetime(timestamps))
    pandas_dtype = nw.from_native(native_series, series_only=True).dtype
    expected_tz = "UTC+01:00" if is_pandas_2_plus else "pytz.FixedOffset(60)"
    assert pandas_dtype == nw.Datetime("ns", expected_tz)

    if not is_pandas_2_plus:  # pragma: no cover
        return

    # PyArrow-backed dtype, obtained through `convert_dtypes`.
    pyarrow_series = pd.Series(pd.to_datetime(timestamps)).convert_dtypes(
        dtype_backend="pyarrow"
    )
    pyarrow_dtype = nw.from_native(pyarrow_series, series_only=True).dtype
    assert pyarrow_dtype == nw.Datetime("ns", "+01:00")

0 comments on commit 8763535

Please sign in to comment.