From 8e46e4e47a11d37cb7b2b49079307b5a49c57488 Mon Sep 17 00:00:00 2001 From: Anatoly Myachev Date: Mon, 11 Mar 2024 16:31:25 +0100 Subject: [PATCH 01/50] TEST-#7049: Add some sanity tests with pyarrow-backed pandas dataframes Signed-off-by: Anatoly Myachev --- modin/core/dataframe/algebra/binary.py | 1 + modin/tests/pandas/test_series.py | 88 ++++++++++++++++++++++++++ 2 files changed, 89 insertions(+) diff --git a/modin/core/dataframe/algebra/binary.py b/modin/core/dataframe/algebra/binary.py index b2ffc2e6788..75474bb3f08 100644 --- a/modin/core/dataframe/algebra/binary.py +++ b/modin/core/dataframe/algebra/binary.py @@ -250,6 +250,7 @@ def try_compute_new_dtypes( try: if infer_dtypes == "bool" or is_bool_dtype(result_dtype): + # can be `pandas.api.types.pandas_dtype("bool[pyarrow]")` depending on the data dtypes = maybe_build_dtypes_series( first, second, dtype=pandas.api.types.pandas_dtype(bool) ) diff --git a/modin/tests/pandas/test_series.py b/modin/tests/pandas/test_series.py index b079ce586dd..3530a2268d9 100644 --- a/modin/tests/pandas/test_series.py +++ b/modin/tests/pandas/test_series.py @@ -1387,6 +1387,94 @@ def test_constructor_arrow_extension_array(): df_equals(md_ser.dtypes, pd_ser.dtypes) +def test_pyarrow_constructor(): + pa = pytest.importorskip("pyarrow") + data = list("abcd") + _ = pd.Series(data, dtype="string[pyarrow]") + _ = pd.Series(data, dtype=pd.ArrowDtype(pa.string())) + + list_str_type = pa.list_(pa.string()) + _ = pd.Series([["hello"], ["there"]], dtype=pd.ArrowDtype(list_str_type)) + + from datetime import time + + _ = pd.Index([time(12, 30), None], dtype=pd.ArrowDtype(pa.time64("us"))) + + from decimal import Decimal + + decimal_type = pd.ArrowDtype(pa.decimal128(3, scale=2)) + + data = [[Decimal("3.19"), None], [None, Decimal("-1.23")]] + + _ = pd.DataFrame(data, dtype=decimal_type) + + +def test_pyarrow_array_retrieve(): + pa = pytest.importorskip("pyarrow") + modin_series, pandas_series = create_test_series( + [1, 2, None], dtype="uint8[pyarrow]" + ) + eval_general( + modin_series, + pandas_series, + lambda ser: pa.array(ser), + raising_exceptions=(Exception,), + ) + + +def test_pyarrow_functions(): + pytest.importorskip("pyarrow") + modin_series, pandas_series = create_test_series( + [-1.545, 0.211, None], dtype="float32[pyarrow]" + ) + df_equals(modin_series.mean(), pandas_series.mean()) + + def comparator(df1, df2): + df_equals(df1, df2) + df_equals(df1.dtypes, df2.dtypes) + + eval_general( + modin_series, + pandas_series, + lambda ser: ser + + (modin_series if isinstance(ser, pd.Series) else pandas_series), + comparator=comparator, + raising_exceptions=(Exception,), + ) + + eval_general( + modin_series, + pandas_series, + lambda ser: ser > (ser + 1), + comparator=comparator, + raising_exceptions=(Exception,), + ) + + eval_general( + modin_series, + pandas_series, + lambda ser: ser.dropna(), + comparator=comparator, + raising_exceptions=(Exception,), + ) + + eval_general( + modin_series, + pandas_series, + lambda ser: ser.isna(), + comparator=comparator, + raising_exceptions=(Exception,), + ) + + eval_general( + modin_series, + pandas_series, + lambda ser: ser.fillna(0), + comparator=comparator, + raising_exceptions=(Exception,), + ) + + @pytest.mark.parametrize("data", test_data_values, ids=test_data_keys) def test_copy(data): modin_series, pandas_series = create_test_series(data) From 6814c6eddee2b1004d8b8f24495e8c410cb16c7c Mon Sep 17 00:00:00 2001 From: Anatoly Myachev Date: Tue, 2 Apr 2024 15:59:51 +0200 Subject: [PATCH 02/50] fixes 
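
These are small follow-ups to the sanity tests added in the previous commit:
the blanket `raising_exceptions=(Exception,)` argument is dropped from the
new `eval_general` calls so the helper falls back to its default error
handling. For context, a minimal sketch of the kind of pyarrow-backed data
these tests exercise (illustrative only; it assumes `pyarrow` is installed
and is not part of the diff below):

    import pandas

    # values live in a pyarrow-backed extension array; None is stored
    # as a null and surfaces as pd.NA rather than np.nan
    ser = pandas.Series([1, 2, None], dtype="uint8[pyarrow]")
    assert str(ser.dtype) == "uint8[pyarrow]"
    assert ser.isna().tolist() == [False, False, True]
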
Signed-off-by: Anatoly Myachev --- modin/tests/pandas/test_series.py | 6 ------ 1 file changed, 6 deletions(-) diff --git a/modin/tests/pandas/test_series.py b/modin/tests/pandas/test_series.py index 3530a2268d9..36844772ada 100644 --- a/modin/tests/pandas/test_series.py +++ b/modin/tests/pandas/test_series.py @@ -1418,7 +1418,6 @@ def test_pyarrow_array_retrieve(): modin_series, pandas_series, lambda ser: pa.array(ser), - raising_exceptions=(Exception,), ) @@ -1439,7 +1438,6 @@ def comparator(df1, df2): lambda ser: ser + (modin_series if isinstance(ser, pd.Series) else pandas_series), comparator=comparator, - raising_exceptions=(Exception,), ) eval_general( @@ -1447,7 +1445,6 @@ def comparator(df1, df2): pandas_series, lambda ser: ser > (ser + 1), comparator=comparator, - raising_exceptions=(Exception,), ) eval_general( @@ -1455,7 +1452,6 @@ def comparator(df1, df2): pandas_series, lambda ser: ser.dropna(), comparator=comparator, - raising_exceptions=(Exception,), ) eval_general( @@ -1463,7 +1459,6 @@ def comparator(df1, df2): pandas_series, lambda ser: ser.isna(), comparator=comparator, - raising_exceptions=(Exception,), ) eval_general( @@ -1471,7 +1466,6 @@ def comparator(df1, df2): pandas_series, lambda ser: ser.fillna(0), comparator=comparator, - raising_exceptions=(Exception,), ) From e1dbc69f0572a0db2fc41d37bd574ab786326672 Mon Sep 17 00:00:00 2001 From: Anatoly Myachev Date: Fri, 19 Apr 2024 12:50:02 +0200 Subject: [PATCH 03/50] fix Signed-off-by: Anatoly Myachev --- modin/core/dataframe/algebra/binary.py | 2 +- modin/tests/pandas/test_series.py | 12 ------------ 2 files changed, 1 insertion(+), 13 deletions(-) diff --git a/modin/core/dataframe/algebra/binary.py b/modin/core/dataframe/algebra/binary.py index 75474bb3f08..a5fc89dd573 100644 --- a/modin/core/dataframe/algebra/binary.py +++ b/modin/core/dataframe/algebra/binary.py @@ -250,7 +250,7 @@ def try_compute_new_dtypes( try: if infer_dtypes == "bool" or is_bool_dtype(result_dtype): - # can be `pandas.api.types.pandas_dtype("bool[pyarrow]")` depending on the data + # FIXME: can be `pandas.api.types.pandas_dtype("bool[pyarrow]")` depending on the data dtypes = maybe_build_dtypes_series( first, second, dtype=pandas.api.types.pandas_dtype(bool) ) diff --git a/modin/tests/pandas/test_series.py b/modin/tests/pandas/test_series.py index 36844772ada..8b4ec960de2 100644 --- a/modin/tests/pandas/test_series.py +++ b/modin/tests/pandas/test_series.py @@ -1409,18 +1409,6 @@ def test_pyarrow_constructor(): _ = pd.DataFrame(data, dtype=decimal_type) -def test_pyarrow_array_retrieve(): - pa = pytest.importorskip("pyarrow") - modin_series, pandas_series = create_test_series( - [1, 2, None], dtype="uint8[pyarrow]" - ) - eval_general( - modin_series, - pandas_series, - lambda ser: pa.array(ser), - ) - - def test_pyarrow_functions(): pytest.importorskip("pyarrow") modin_series, pandas_series = create_test_series( From 7b925a50c5f85ff8df8deae41b76295b56946beb Mon Sep 17 00:00:00 2001 From: Anatoly Myachev Date: Fri, 19 Apr 2024 13:15:33 +0200 Subject: [PATCH 04/50] cleanup Signed-off-by: Anatoly Myachev --- .../pandas/dataframe/test_map_metadata.py | 9 ++++++ modin/tests/pandas/test_series.py | 32 +++++++------------ 2 files changed, 20 insertions(+), 21 deletions(-) diff --git a/modin/tests/pandas/dataframe/test_map_metadata.py b/modin/tests/pandas/dataframe/test_map_metadata.py index b6dc1686ff8..ab7a7fa4a31 100644 --- a/modin/tests/pandas/dataframe/test_map_metadata.py +++ b/modin/tests/pandas/dataframe/test_map_metadata.py @@ -11,6 +11,8 @@ # 
ANY KIND, either express or implied. See the License for the specific language # governing permissions and limitations under the License. +from decimal import Decimal + import matplotlib import numpy as np import pandas @@ -1797,6 +1799,13 @@ def test_constructor(data): df_equals(pandas_df, modin_df) +def test_pyarrow_constructor(): + pa = pytest.importorskip("pyarrow") + + data = [[Decimal("3.19"), None], [None, Decimal("-1.23")]] + df_equals(*create_test_dfs(data, dtype=pd.ArrowDtype(pa.decimal128(3, scale=2)))) + + @pytest.mark.parametrize( "data", [ diff --git a/modin/tests/pandas/test_series.py b/modin/tests/pandas/test_series.py index 0d17823bf61..e5ffad9a7ee 100644 --- a/modin/tests/pandas/test_series.py +++ b/modin/tests/pandas/test_series.py @@ -1390,23 +1390,12 @@ def test_constructor_arrow_extension_array(): def test_pyarrow_constructor(): pa = pytest.importorskip("pyarrow") data = list("abcd") - _ = pd.Series(data, dtype="string[pyarrow]") - _ = pd.Series(data, dtype=pd.ArrowDtype(pa.string())) + df_equals(*create_test_series(data, dtype="string[pyarrow]")) + df_equals(*create_test_series(data, dtype=pd.ArrowDtype(pa.string()))) + data = [["hello"], ["there"]] list_str_type = pa.list_(pa.string()) - _ = pd.Series([["hello"], ["there"]], dtype=pd.ArrowDtype(list_str_type)) - - from datetime import time - - _ = pd.Index([time(12, 30), None], dtype=pd.ArrowDtype(pa.time64("us"))) - - from decimal import Decimal - - decimal_type = pd.ArrowDtype(pa.decimal128(3, scale=2)) - - data = [[Decimal("3.19"), None], [None, Decimal("-1.23")]] - - _ = pd.DataFrame(data, dtype=decimal_type) + df_equals(*create_test_series(data, dtype=pd.ArrowDtype(list_str_type))) def test_pyarrow_functions(): @@ -1428,12 +1417,13 @@ def comparator(df1, df2): comparator=comparator, ) - eval_general( - modin_series, - pandas_series, - lambda ser: ser > (ser + 1), - comparator=comparator, - ) + # FIXME: https://github.com/modin-project/modin/issues/7203 + # eval_general( + # modin_series, + # pandas_series, + # lambda ser: ser > (ser + 1), + # comparator=comparator, + # ) eval_general( modin_series, From 23003c580487971ae5ec82d8b9a7bfb58825f76b Mon Sep 17 00:00:00 2001 From: Anatoly Myachev Date: Fri, 19 Apr 2024 13:18:01 +0200 Subject: [PATCH 05/50] fix comment Signed-off-by: Anatoly Myachev --- modin/core/dataframe/algebra/binary.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/modin/core/dataframe/algebra/binary.py b/modin/core/dataframe/algebra/binary.py index a5fc89dd573..6af31ab826c 100644 --- a/modin/core/dataframe/algebra/binary.py +++ b/modin/core/dataframe/algebra/binary.py @@ -250,7 +250,8 @@ def try_compute_new_dtypes( try: if infer_dtypes == "bool" or is_bool_dtype(result_dtype): - # FIXME: can be `pandas.api.types.pandas_dtype("bool[pyarrow]")` depending on the data + # FIXME: https://github.com/modin-project/modin/issues/7203 + # can be `pandas.api.types.pandas_dtype("bool[pyarrow]")` depending on the data dtypes = maybe_build_dtypes_series( first, second, dtype=pandas.api.types.pandas_dtype(bool) ) From cc2a5ab8a4c14a2bd736cc390b4dc15b1003e328 Mon Sep 17 00:00:00 2001 From: Anatoly Myachev Date: Fri, 19 Apr 2024 14:06:55 +0200 Subject: [PATCH 06/50] skip some cases for HDK Signed-off-by: Anatoly Myachev --- modin/tests/pandas/test_series.py | 30 +++++++++++++++++------------- 1 file changed, 17 insertions(+), 13 deletions(-) diff --git a/modin/tests/pandas/test_series.py b/modin/tests/pandas/test_series.py index e5ffad9a7ee..fe3e3bc2e33 100644 --- 
a/modin/tests/pandas/test_series.py +++ b/modin/tests/pandas/test_series.py @@ -1409,13 +1409,15 @@ def comparator(df1, df2): df_equals(df1, df2) df_equals(df1.dtypes, df2.dtypes) - eval_general( - modin_series, - pandas_series, - lambda ser: ser - + (modin_series if isinstance(ser, pd.Series) else pandas_series), - comparator=comparator, - ) + if StorageFormat.get() != "Hdk": + # FIXME: HDK should also work in this case + eval_general( + modin_series, + pandas_series, + lambda ser: ser + + (modin_series if isinstance(ser, pd.Series) else pandas_series), + comparator=comparator, + ) # FIXME: https://github.com/modin-project/modin/issues/7203 # eval_general( @@ -1439,12 +1441,14 @@ def comparator(df1, df2): comparator=comparator, ) - eval_general( - modin_series, - pandas_series, - lambda ser: ser.fillna(0), - comparator=comparator, - ) + if StorageFormat.get() != "Hdk": + # FIXME: HDK should also work in this case + eval_general( + modin_series, + pandas_series, + lambda ser: ser.fillna(0), + comparator=comparator, + ) def test_pyarrow_array_retrieve(): From b710865e9dbd09fdbb30772cf76110e2e00467df Mon Sep 17 00:00:00 2001 From: Anatoly Myachev Date: Fri, 19 Apr 2024 14:42:14 +0200 Subject: [PATCH 07/50] FEAT-#7203: Make sure modin works correctly with pandas, which uses pyarrow as a backend Signed-off-by: Anatoly Myachev --- .../hdk_on_native/dataframe/utils.py | 6 ++---- modin/tests/pandas/test_series.py | 16 +++++++--------- 2 files changed, 9 insertions(+), 13 deletions(-) diff --git a/modin/experimental/core/execution/native/implementations/hdk_on_native/dataframe/utils.py b/modin/experimental/core/execution/native/implementations/hdk_on_native/dataframe/utils.py index f99cc256baa..4f749cf0e3b 100644 --- a/modin/experimental/core/execution/native/implementations/hdk_on_native/dataframe/utils.py +++ b/modin/experimental/core/execution/native/implementations/hdk_on_native/dataframe/utils.py @@ -23,7 +23,7 @@ import pandas import pyarrow as pa from pandas.core.arrays.arrow.extension_types import ArrowIntervalType -from pandas.core.dtypes.common import _get_dtype, is_string_dtype +from pandas.core.dtypes.common import _get_dtype from pyarrow.types import is_dictionary from modin.pandas.indexing import is_range_like @@ -504,9 +504,7 @@ def to_arrow_type(dtype) -> pa.lib.DataType: ------- pa.lib.DataType """ - if is_string_dtype(dtype): - return pa.from_numpy_dtype(str) - return pa.from_numpy_dtype(dtype) + return pandas.api.types.pandas_dtype(dtype).pyarrow_dtype def get_common_arrow_type(t1: pa.lib.DataType, t2: pa.lib.DataType) -> pa.lib.DataType: diff --git a/modin/tests/pandas/test_series.py b/modin/tests/pandas/test_series.py index fe3e3bc2e33..89722a62e6e 100644 --- a/modin/tests/pandas/test_series.py +++ b/modin/tests/pandas/test_series.py @@ -1409,15 +1409,13 @@ def comparator(df1, df2): df_equals(df1, df2) df_equals(df1.dtypes, df2.dtypes) - if StorageFormat.get() != "Hdk": - # FIXME: HDK should also work in this case - eval_general( - modin_series, - pandas_series, - lambda ser: ser - + (modin_series if isinstance(ser, pd.Series) else pandas_series), - comparator=comparator, - ) + eval_general( + modin_series, + pandas_series, + lambda ser: ser + + (modin_series if isinstance(ser, pd.Series) else pandas_series), + comparator=comparator, + ) # FIXME: https://github.com/modin-project/modin/issues/7203 # eval_general( From 310f12a7c6a45c4d02b8388df3990d03ccb3f015 Mon Sep 17 00:00:00 2001 From: Anatoly Myachev Date: Fri, 19 Apr 2024 15:22:24 +0200 Subject: [PATCH 08/50] don't use 
numpy types directly Signed-off-by: Anatoly Myachev --- modin/core/dataframe/base/dataframe/utils.py | 3 +- .../dataframe/pandas/dataframe/dataframe.py | 8 +- .../storage_formats/pandas/query_compiler.py | 122 +++++++++--------- .../hdk_on_native/calcite_serializer.py | 1 + modin/numpy/indexing.py | 4 +- modin/pandas/base.py | 12 +- modin/pandas/indexing.py | 4 +- 7 files changed, 76 insertions(+), 78 deletions(-) diff --git a/modin/core/dataframe/base/dataframe/utils.py b/modin/core/dataframe/base/dataframe/utils.py index adc159a1a0f..c8e3f742193 100644 --- a/modin/core/dataframe/base/dataframe/utils.py +++ b/modin/core/dataframe/base/dataframe/utils.py @@ -21,7 +21,6 @@ from enum import Enum from typing import Dict, List, Sequence, Tuple, cast -import numpy as np import pandas from pandas._typing import IndexLabel from pandas.api.types import is_scalar @@ -170,7 +169,7 @@ def is_trivial_index(index: pandas.Index) -> bool: return True if isinstance(index, pandas.RangeIndex): return index.start == 0 and index.step == 1 - if not (isinstance(index, pandas.Index) and index.dtype == np.int64): + if not (isinstance(index, pandas.Index) and index.dtype == "int64"): return False return ( index.is_monotonic_increasing diff --git a/modin/core/dataframe/pandas/dataframe/dataframe.py b/modin/core/dataframe/pandas/dataframe/dataframe.py index 6539e4d286f..bf13942b594 100644 --- a/modin/core/dataframe/pandas/dataframe/dataframe.py +++ b/modin/core/dataframe/pandas/dataframe/dataframe.py @@ -1175,7 +1175,7 @@ def _take_2d_positional( + f"received: {type(indexer)}", ) if isinstance(indexer, list): - indexer = np.array(indexer, dtype=np.int64) + indexer = np.array(indexer, dtype="int64") indexers.append(indexer) row_positions, col_positions = indexers @@ -1836,13 +1836,13 @@ def _get_dict_of_block_index(self, axis, indices, are_indices_sorted=False): return dict_of_slices if isinstance(indices, list): # Converting python list to numpy for faster processing - indices = np.array(indices, dtype=np.int64) + indices = np.array(indices, dtype="int64") # Fasttrack empty numpy array if isinstance(indices, np.ndarray) and indices.size == 0: # This will help preserve metadata stored in empty dataframes (indexes and dtypes) # Otherwise, we will get an empty `new_partitions` array, from which it will # no longer be possible to obtain metadata - return dict([(0, np.array([], dtype=np.int64))]) + return dict([(0, np.array([], dtype="int64"))]) negative_mask = np.less(indices, 0) has_negative = np.any(negative_mask) if has_negative: @@ -1850,7 +1850,7 @@ def _get_dict_of_block_index(self, axis, indices, are_indices_sorted=False): indices = ( indices.copy() if isinstance(indices, np.ndarray) - else np.array(indices, dtype=np.int64) + else np.array(indices, dtype="int64") ) indices[negative_mask] = indices[negative_mask] % len(self.get_axis(axis)) # If the `indices` array was modified because of the negative indices conversion diff --git a/modin/core/storage_formats/pandas/query_compiler.py b/modin/core/storage_formats/pandas/query_compiler.py index 7fd07d5fe91..ecdd0704129 100644 --- a/modin/core/storage_formats/pandas/query_compiler.py +++ b/modin/core/storage_formats/pandas/query_compiler.py @@ -1840,7 +1840,7 @@ def isin_func(df, values): ) return res - return Map.register(isin_func, shape_hint=shape_hint, dtypes=np.bool_)( + return Map.register(isin_func, shape_hint=shape_hint, dtypes="bool")( self, values ) @@ -1849,7 +1849,7 @@ def isin_func(df, values): conj = Map.register(lambda df, *args, **kwargs: 
pandas.DataFrame(np.conj(df))) convert_dtypes = Fold.register(pandas.DataFrame.convert_dtypes) invert = Map.register(pandas.DataFrame.__invert__, dtypes="copy") - isna = Map.register(pandas.DataFrame.isna, dtypes=np.bool_) + isna = Map.register(pandas.DataFrame.isna, dtypes="bool") _isfinite = Map.register( lambda df, *args, **kwargs: pandas.DataFrame(np.isfinite(df, *args, **kwargs)), dtypes=np.bool_, @@ -1889,7 +1889,7 @@ def isin_func(df, values): lambda df, *args, **kwargs: pandas.DataFrame(np.exp(df, *args, **kwargs)) ) # Needed for numpy API negative = Map.register(pandas.DataFrame.__neg__) - notna = Map.register(pandas.DataFrame.notna, dtypes=np.bool_) + notna = Map.register(pandas.DataFrame.notna, dtypes="bool") round = Map.register(pandas.DataFrame.round) replace = Map.register(pandas.DataFrame.replace) series_view = Map.register( @@ -1915,24 +1915,24 @@ def isin_func(df, values): str_capitalize = Map.register(_str_map("capitalize"), dtypes="copy") str_center = Map.register(_str_map("center"), dtypes="copy") - str_contains = Map.register(_str_map("contains"), dtypes=np.bool_) - str_count = Map.register(_str_map("count"), dtypes=int) - str_endswith = Map.register(_str_map("endswith"), dtypes=np.bool_) - str_find = Map.register(_str_map("find"), dtypes=np.int64) + str_contains = Map.register(_str_map("contains"), dtypes="bool") + str_count = Map.register(_str_map("count"), dtypes="int64") + str_endswith = Map.register(_str_map("endswith"), dtypes="bool") + str_find = Map.register(_str_map("find"), dtypes="int64") str_findall = Map.register(_str_map("findall"), dtypes="copy") str_get = Map.register(_str_map("get"), dtypes="copy") - str_index = Map.register(_str_map("index"), dtypes=np.int64) - str_isalnum = Map.register(_str_map("isalnum"), dtypes=np.bool_) - str_isalpha = Map.register(_str_map("isalpha"), dtypes=np.bool_) - str_isdecimal = Map.register(_str_map("isdecimal"), dtypes=np.bool_) - str_isdigit = Map.register(_str_map("isdigit"), dtypes=np.bool_) - str_islower = Map.register(_str_map("islower"), dtypes=np.bool_) - str_isnumeric = Map.register(_str_map("isnumeric"), dtypes=np.bool_) - str_isspace = Map.register(_str_map("isspace"), dtypes=np.bool_) - str_istitle = Map.register(_str_map("istitle"), dtypes=np.bool_) - str_isupper = Map.register(_str_map("isupper"), dtypes=np.bool_) + str_index = Map.register(_str_map("index"), dtypes="int64") + str_isalnum = Map.register(_str_map("isalnum"), dtypes="bool") + str_isalpha = Map.register(_str_map("isalpha"), dtypes="bool") + str_isdecimal = Map.register(_str_map("isdecimal"), dtypes="bool") + str_isdigit = Map.register(_str_map("isdigit"), dtypes="bool") + str_islower = Map.register(_str_map("islower"), dtypes="bool") + str_isnumeric = Map.register(_str_map("isnumeric"), dtypes="bool") + str_isspace = Map.register(_str_map("isspace"), dtypes="bool") + str_istitle = Map.register(_str_map("istitle"), dtypes="bool") + str_isupper = Map.register(_str_map("isupper"), dtypes="bool") str_join = Map.register(_str_map("join"), dtypes="copy") - str_len = Map.register(_str_map("len"), dtypes=int) + str_len = Map.register(_str_map("len"), dtypes="int64") str_ljust = Map.register(_str_map("ljust"), dtypes="copy") str_lower = Map.register(_str_map("lower"), dtypes="copy") str_lstrip = Map.register(_str_map("lstrip"), dtypes="copy") @@ -1961,8 +1961,8 @@ def str_extract(self, pat, flags, expand): return qc str_replace = Map.register(_str_map("replace"), dtypes="copy", shape_hint="column") - str_rfind = Map.register(_str_map("rfind"), 
dtypes=np.int64, shape_hint="column") - str_rindex = Map.register(_str_map("rindex"), dtypes=np.int64, shape_hint="column") + str_rfind = Map.register(_str_map("rfind"), dtypes="int64", shape_hint="column") + str_rindex = Map.register(_str_map("rindex"), dtypes="int64", shape_hint="column") str_rjust = Map.register(_str_map("rjust"), dtypes="copy", shape_hint="column") _str_rpartition = Map.register( _str_map("rpartition"), dtypes="copy", shape_hint="column" @@ -1996,7 +1996,7 @@ def str_split(self, pat=None, n=-1, expand=False, regex=None): return self._str_split(pat=pat, n=n, expand=False, regex=regex) str_startswith = Map.register( - _str_map("startswith"), dtypes=np.bool_, shape_hint="column" + _str_map("startswith"), dtypes="bool", shape_hint="column" ) str_strip = Map.register(_str_map("strip"), dtypes="copy", shape_hint="column") str_swapcase = Map.register( @@ -2068,51 +2068,49 @@ def searchsorted(df): # Dt map partitions operations - dt_date = Map.register(_dt_prop_map("date"), dtypes=np.object_) - dt_time = Map.register(_dt_prop_map("time"), dtypes=np.object_) - dt_timetz = Map.register(_dt_prop_map("timetz"), dtypes=np.object_) - dt_year = Map.register(_dt_prop_map("year"), dtypes=np.int32) - dt_month = Map.register(_dt_prop_map("month"), dtypes=np.int32) - dt_day = Map.register(_dt_prop_map("day"), dtypes=np.int32) - dt_hour = Map.register(_dt_prop_map("hour"), dtypes=np.int64) - dt_minute = Map.register(_dt_prop_map("minute"), dtypes=np.int64) - dt_second = Map.register(_dt_prop_map("second"), dtypes=np.int64) - dt_microsecond = Map.register(_dt_prop_map("microsecond"), dtypes=np.int64) - dt_nanosecond = Map.register(_dt_prop_map("nanosecond"), dtypes=np.int64) - dt_dayofweek = Map.register(_dt_prop_map("dayofweek"), dtypes=np.int64) - dt_weekday = Map.register(_dt_prop_map("weekday"), dtypes=np.int64) - dt_dayofyear = Map.register(_dt_prop_map("dayofyear"), dtypes=np.int64) - dt_quarter = Map.register(_dt_prop_map("quarter"), dtypes=np.int64) - dt_is_month_start = Map.register(_dt_prop_map("is_month_start"), dtypes=np.bool_) - dt_is_month_end = Map.register(_dt_prop_map("is_month_end"), dtypes=np.bool_) - dt_is_quarter_start = Map.register( - _dt_prop_map("is_quarter_start"), dtypes=np.bool_ - ) - dt_is_quarter_end = Map.register(_dt_prop_map("is_quarter_end"), dtypes=np.bool_) - dt_is_year_start = Map.register(_dt_prop_map("is_year_start"), dtypes=np.bool_) - dt_is_year_end = Map.register(_dt_prop_map("is_year_end"), dtypes=np.bool_) - dt_is_leap_year = Map.register(_dt_prop_map("is_leap_year"), dtypes=np.bool_) - dt_daysinmonth = Map.register(_dt_prop_map("daysinmonth"), dtypes=np.int64) - dt_days_in_month = Map.register(_dt_prop_map("days_in_month"), dtypes=np.int64) + dt_date = Map.register(_dt_prop_map("date"), dtypes="object") + dt_time = Map.register(_dt_prop_map("time"), dtypes="object") + dt_timetz = Map.register(_dt_prop_map("timetz"), dtypes="object") + dt_year = Map.register(_dt_prop_map("year"), dtypes="int32") + dt_month = Map.register(_dt_prop_map("month"), dtypes="int32") + dt_day = Map.register(_dt_prop_map("day"), dtypes="int32") + dt_hour = Map.register(_dt_prop_map("hour"), dtypes="int64") + dt_minute = Map.register(_dt_prop_map("minute"), dtypes="int64") + dt_second = Map.register(_dt_prop_map("second"), dtypes="int64") + dt_microsecond = Map.register(_dt_prop_map("microsecond"), dtypes="int64") + dt_nanosecond = Map.register(_dt_prop_map("nanosecond"), dtypes="int64") + dt_dayofweek = Map.register(_dt_prop_map("dayofweek"), dtypes="int64") + dt_weekday = 
Map.register(_dt_prop_map("weekday"), dtypes="int64") + dt_dayofyear = Map.register(_dt_prop_map("dayofyear"), dtypes="int64") + dt_quarter = Map.register(_dt_prop_map("quarter"), dtypes="int64") + dt_is_month_start = Map.register(_dt_prop_map("is_month_start"), dtypes="bool") + dt_is_month_end = Map.register(_dt_prop_map("is_month_end"), dtypes="bool") + dt_is_quarter_start = Map.register(_dt_prop_map("is_quarter_start"), dtypes="bool") + dt_is_quarter_end = Map.register(_dt_prop_map("is_quarter_end"), dtypes="bool") + dt_is_year_start = Map.register(_dt_prop_map("is_year_start"), dtypes="bool") + dt_is_year_end = Map.register(_dt_prop_map("is_year_end"), dtypes="bool") + dt_is_leap_year = Map.register(_dt_prop_map("is_leap_year"), dtypes="bool") + dt_daysinmonth = Map.register(_dt_prop_map("daysinmonth"), dtypes="int64") + dt_days_in_month = Map.register(_dt_prop_map("days_in_month"), dtypes="int64") dt_asfreq = Map.register(_dt_func_map("asfreq")) dt_to_period = Map.register(_dt_func_map("to_period")) - dt_to_pydatetime = Map.register(_dt_func_map("to_pydatetime"), dtypes=np.object_) + dt_to_pydatetime = Map.register(_dt_func_map("to_pydatetime"), dtypes="object") dt_tz_localize = Map.register(_dt_func_map("tz_localize")) dt_tz_convert = Map.register(_dt_func_map("tz_convert")) dt_normalize = Map.register(_dt_func_map("normalize")) - dt_strftime = Map.register(_dt_func_map("strftime"), dtypes=np.object_) + dt_strftime = Map.register(_dt_func_map("strftime"), dtypes="object") dt_round = Map.register(_dt_func_map("round")) dt_floor = Map.register(_dt_func_map("floor")) dt_ceil = Map.register(_dt_func_map("ceil")) - dt_month_name = Map.register(_dt_func_map("month_name"), dtypes=np.object_) - dt_day_name = Map.register(_dt_func_map("day_name"), dtypes=np.object_) - dt_to_pytimedelta = Map.register(_dt_func_map("to_pytimedelta"), dtypes=np.object_) - dt_total_seconds = Map.register(_dt_func_map("total_seconds"), dtypes=np.float64) - dt_seconds = Map.register(_dt_prop_map("seconds"), dtypes=np.int64) - dt_days = Map.register(_dt_prop_map("days"), dtypes=np.int64) - dt_microseconds = Map.register(_dt_prop_map("microseconds"), dtypes=np.int64) - dt_nanoseconds = Map.register(_dt_prop_map("nanoseconds"), dtypes=np.int64) - dt_qyear = Map.register(_dt_prop_map("qyear"), dtypes=np.int64) + dt_month_name = Map.register(_dt_func_map("month_name"), dtypes="object") + dt_day_name = Map.register(_dt_func_map("day_name"), dtypes="object") + dt_to_pytimedelta = Map.register(_dt_func_map("to_pytimedelta"), dtypes="object") + dt_total_seconds = Map.register(_dt_func_map("total_seconds"), dtypes="float64") + dt_seconds = Map.register(_dt_prop_map("seconds"), dtypes="int64") + dt_days = Map.register(_dt_prop_map("days"), dtypes="int64") + dt_microseconds = Map.register(_dt_prop_map("microseconds"), dtypes="int64") + dt_nanoseconds = Map.register(_dt_prop_map("nanoseconds"), dtypes="int64") + dt_qyear = Map.register(_dt_prop_map("qyear"), dtypes="int64") dt_start_time = Map.register(_dt_prop_map("start_time")) dt_end_time = Map.register(_dt_prop_map("end_time")) dt_to_timestamp = Map.register(_dt_func_map("to_timestamp")) @@ -2284,7 +2282,7 @@ def map_func(df): # pragma: no cover n_rows = df.shape[0] df_mask = np.isfinite(df) - result = np.empty((n_rows, n_cols), dtype=np.float64) + result = np.empty((n_rows, n_cols), dtype="float64") for i in range(n_rows): df_ith_row = df[i] @@ -2636,7 +2634,7 @@ def quantile_builder(df, **kwargs): lambda df: quantile_builder(df, **kwargs), new_index=q_index, 
new_columns=new_columns, - dtypes=np.float64, + dtypes="float64", ) result = self.__constructor__(new_modin_frame) return result.transpose() if axis == 1 else result @@ -2653,7 +2651,7 @@ def rank(self, **kwargs): if not numeric_only else None ), - dtypes=np.float64, + dtypes="float64", sync_labels=False, ) return self.__constructor__(new_modin_frame) @@ -3163,7 +3161,7 @@ def _compute_duplicated(df): # pragma: no cover func=_compute_duplicated, new_index=self._modin_frame.copy_index_cache(), new_columns=[MODIN_UNNAMED_SERIES_LABEL], - dtypes=np.bool_, + dtypes="bool", keep_partitioning=True, ) return self.__constructor__(new_modin_frame, shape_hint="column") diff --git a/modin/experimental/core/execution/native/implementations/hdk_on_native/calcite_serializer.py b/modin/experimental/core/execution/native/implementations/hdk_on_native/calcite_serializer.py index 7099751dafe..b00e73dc745 100644 --- a/modin/experimental/core/execution/native/implementations/hdk_on_native/calcite_serializer.py +++ b/modin/experimental/core/execution/native/implementations/hdk_on_native/calcite_serializer.py @@ -67,6 +67,7 @@ class CalciteSerializer: "datetime64": "TIMESTAMP", } + # TODO: Is it necessary to use more general types here (not dependent on NumPy)? _INT_OPTS = { np.int8: ("TINYINT", 3), np.int16: ("SMALLINT", 5), diff --git a/modin/numpy/indexing.py b/modin/numpy/indexing.py index b598577a34d..4223ae3e513 100644 --- a/modin/numpy/indexing.py +++ b/modin/numpy/indexing.py @@ -214,7 +214,7 @@ def boolean_mask_to_numeric(indexer): # `itertools.compress` masks `data` with the `selectors` mask, # works about ~10% faster than a pure list comprehension itertools.compress(data=range(len(indexer)), selectors=indexer), - dtype=np.int64, + dtype="int64", ) @@ -585,7 +585,7 @@ def _compute_lookup(self, row_loc, col_loc): # `Index.__getitem__` works much faster with numpy arrays than with python lists, # so although we lose some time here on converting to numpy, `Index.__getitem__` # speedup covers the loss that we gain here. - axis_loc = np.array(axis_loc, dtype=np.int64) + axis_loc = np.array(axis_loc, dtype="int64") # Relatively fast check allows us to not trigger `self.qc.get_axis()` computation # if there're no negative indices and so they don't not depend on the axis length. if isinstance(axis_loc, np.ndarray) and not (axis_loc < 0).any(): diff --git a/modin/pandas/base.py b/modin/pandas/base.py index 282d27eaf67..81dc3b03a46 100644 --- a/modin/pandas/base.py +++ b/modin/pandas/base.py @@ -1530,7 +1530,7 @@ def eq(self, other, axis="columns", level=None): # noqa: PR01, RT01, D200 """ Get equality of `BasePandasDataset` and `other`, element-wise (binary operator `eq`). """ - return self._binary_op("eq", other, axis=axis, level=level, dtypes=np.bool_) + return self._binary_op("eq", other, axis=axis, level=level, dtypes="bool") def explode(self, column, ignore_index: bool = False): # noqa: PR01, RT01, D200 """ @@ -1831,7 +1831,7 @@ def ge(self, other, axis="columns", level=None): # noqa: PR01, RT01, D200 """ Get greater than or equal comparison of `BasePandasDataset` and `other`, element-wise (binary operator `ge`). 
""" - return self._binary_op("ge", other, axis=axis, level=level, dtypes=np.bool_) + return self._binary_op("ge", other, axis=axis, level=level, dtypes="bool") def get(self, key, default=None): # noqa: PR01, RT01, D200 """ @@ -1847,7 +1847,7 @@ def gt(self, other, axis="columns", level=None): # noqa: PR01, RT01, D200 """ Get greater than comparison of `BasePandasDataset` and `other`, element-wise (binary operator `gt`). """ - return self._binary_op("gt", other, axis=axis, level=level, dtypes=np.bool_) + return self._binary_op("gt", other, axis=axis, level=level, dtypes="bool") def head(self, n=5): # noqa: PR01, RT01, D200 """ @@ -1979,13 +1979,13 @@ def le(self, other, axis="columns", level=None): # noqa: PR01, RT01, D200 """ Get less than or equal comparison of `BasePandasDataset` and `other`, element-wise (binary operator `le`). """ - return self._binary_op("le", other, axis=axis, level=level, dtypes=np.bool_) + return self._binary_op("le", other, axis=axis, level=level, dtypes="bool") def lt(self, other, axis="columns", level=None): # noqa: PR01, RT01, D200 """ Get less than comparison of `BasePandasDataset` and `other`, element-wise (binary operator `lt`). """ - return self._binary_op("lt", other, axis=axis, level=level, dtypes=np.bool_) + return self._binary_op("lt", other, axis=axis, level=level, dtypes="bool") @property def loc(self): # noqa: RT01, D200 @@ -2194,7 +2194,7 @@ def ne(self, other, axis="columns", level=None): # noqa: PR01, RT01, D200 """ Get Not equal comparison of `BasePandasDataset` and `other`, element-wise (binary operator `ne`). """ - return self._binary_op("ne", other, axis=axis, level=level, dtypes=np.bool_) + return self._binary_op("ne", other, axis=axis, level=level, dtypes="bool") def notna(self): # noqa: RT01, D200 """ diff --git a/modin/pandas/indexing.py b/modin/pandas/indexing.py index d901b6dac99..316a75f82a7 100644 --- a/modin/pandas/indexing.py +++ b/modin/pandas/indexing.py @@ -232,7 +232,7 @@ def boolean_mask_to_numeric(indexer): # `itertools.compress` masks `data` with the `selectors` mask, # works about ~10% faster than a pure list comprehension itertools.compress(data=range(len(indexer)), selectors=indexer), - dtype=np.int64, + dtype="int64", ) @@ -1130,7 +1130,7 @@ def _compute_lookup(self, row_loc, col_loc): # `Index.__getitem__` works much faster with numpy arrays than with python lists, # so although we lose some time here on converting to numpy, `Index.__getitem__` # speedup covers the loss that we gain here. - axis_loc = np.array(axis_loc, dtype=np.int64) + axis_loc = np.array(axis_loc, dtype="int64") # Relatively fast check allows us to not trigger `self.qc.get_axis()` computation # if there're no negative indices and so they don't not depend on the axis length. 
if isinstance(axis_loc, np.ndarray) and not (axis_loc < 0).any(): From cb90479c826b465349fbf7c08181190dfa4715e4 Mon Sep 17 00:00:00 2001 From: Anatoly Myachev Date: Fri, 19 Apr 2024 16:39:46 +0200 Subject: [PATCH 09/50] try another dtype_backend Signed-off-by: Anatoly Myachev --- modin/tests/pandas/utils.py | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) diff --git a/modin/tests/pandas/utils.py b/modin/tests/pandas/utils.py index f374071cef8..f34ff0e7c94 100644 --- a/modin/tests/pandas/utils.py +++ b/modin/tests/pandas/utils.py @@ -1088,8 +1088,13 @@ def eval_io_from_str(csv_str: str, unique_filename: str, **kwargs): def create_test_dfs(*args, **kwargs) -> tuple[pd.DataFrame, pandas.DataFrame]: post_fn = kwargs.pop("post_fn", lambda df: df) + post_fn2 = lambda df: post_fn(df).convert_dtypes( + dtype_backend="dtype_backend" + ) # noqa: E731 return tuple( - map(post_fn, [pd.DataFrame(*args, **kwargs), pandas.DataFrame(*args, **kwargs)]) + map( + post_fn2, [pd.DataFrame(*args, **kwargs), pandas.DataFrame(*args, **kwargs)] + ) ) @@ -1103,6 +1108,9 @@ def create_test_series(vals, sort=False, **kwargs) -> tuple[pd.Series, pandas.Se if sort: modin_series = modin_series.sort_values().reset_index(drop=True) pandas_series = pandas_series.sort_values().reset_index(drop=True) + + modin_series = modin_series.convert_dtypes(dtype_backend="dtype_backend") + pandas_series = pandas_series.convert_dtypes(dtype_backend="dtype_backend") return modin_series, pandas_series From f9b25607b6267223f7f362b41549beb693860113 Mon Sep 17 00:00:00 2001 From: Anatoly Myachev Date: Fri, 19 Apr 2024 16:44:17 +0200 Subject: [PATCH 10/50] fix Signed-off-by: Anatoly Myachev --- modin/tests/pandas/utils.py | 16 +++++++--------- 1 file changed, 7 insertions(+), 9 deletions(-) diff --git a/modin/tests/pandas/utils.py b/modin/tests/pandas/utils.py index f34ff0e7c94..4a9be1dd58a 100644 --- a/modin/tests/pandas/utils.py +++ b/modin/tests/pandas/utils.py @@ -1087,14 +1087,12 @@ def eval_io_from_str(csv_str: str, unique_filename: str, **kwargs): def create_test_dfs(*args, **kwargs) -> tuple[pd.DataFrame, pandas.DataFrame]: - post_fn = kwargs.pop("post_fn", lambda df: df) - post_fn2 = lambda df: post_fn(df).convert_dtypes( - dtype_backend="dtype_backend" - ) # noqa: E731 + post_fn = kwargs.pop("post_fn", None) + + if post_fn is None: + post_fn = lambda df: df.convert_dtypes(dtype_backend="pyarrow") # noqa: E731 return tuple( - map( - post_fn2, [pd.DataFrame(*args, **kwargs), pandas.DataFrame(*args, **kwargs)] - ) + map(post_fn, [pd.DataFrame(*args, **kwargs), pandas.DataFrame(*args, **kwargs)]) ) @@ -1109,8 +1107,8 @@ def create_test_series(vals, sort=False, **kwargs) -> tuple[pd.Series, pandas.Se modin_series = modin_series.sort_values().reset_index(drop=True) pandas_series = pandas_series.sort_values().reset_index(drop=True) - modin_series = modin_series.convert_dtypes(dtype_backend="dtype_backend") - pandas_series = pandas_series.convert_dtypes(dtype_backend="dtype_backend") + modin_series = modin_series.convert_dtypes(dtype_backend="pyarrow") + pandas_series = pandas_series.convert_dtypes(dtype_backend="pyarrow") return modin_series, pandas_series From ddcda4ff88abbe94e3e2f60b575eb93269819431 Mon Sep 17 00:00:00 2001 From: Anatoly Myachev Date: Thu, 2 May 2024 16:50:41 +0200 Subject: [PATCH 11/50] fixes Signed-off-by: Anatoly Myachev --- modin/core/dataframe/algebra/binary.py | 17 +++++++--- .../dataframe/pandas/dataframe/dataframe.py | 9 ++++- .../core/dataframe/pandas/metadata/dtypes.py | 33 ++++++++++--------- 
modin/tests/pandas/dataframe/test_default.py | 2 +- modin/tests/pandas/utils.py | 2 ++ 5 files changed, 41 insertions(+), 22 deletions(-) diff --git a/modin/core/dataframe/algebra/binary.py b/modin/core/dataframe/algebra/binary.py index 6af31ab826c..5138d728aba 100644 --- a/modin/core/dataframe/algebra/binary.py +++ b/modin/core/dataframe/algebra/binary.py @@ -179,7 +179,7 @@ def maybe_build_dtypes_series( First operand for which the binary operation would be performed later. second : PandasQueryCompiler, list-like or scalar Second operand for which the binary operation would be performed later. - dtype : np.dtype + dtype : pandas supported dtype Dtype of the result. trigger_computations : bool, default: False Whether to trigger computation of the lazy metadata for `first` and `second`. @@ -250,10 +250,19 @@ def try_compute_new_dtypes( try: if infer_dtypes == "bool" or is_bool_dtype(result_dtype): - # FIXME: https://github.com/modin-project/modin/issues/7203 - # can be `pandas.api.types.pandas_dtype("bool[pyarrow]")` depending on the data + # dataframe can contain types of different backends at the same time, for example: + # (Pdb) (pandas.DataFrame([[1,2,3], [4,5,6]]).astype({0: "int64[pyarrow]"}) > 4).dtypes + # 0 bool[pyarrow] + # 1 bool + # 2 bool + # dtype: object + backend = "" + if any("pyarrow" in str(x) for x in first.dtypes) or any( + "pyarrow" in str(x) for x in second.dtypes + ): + backend = "pyarrow" dtypes = maybe_build_dtypes_series( - first, second, dtype=pandas.api.types.pandas_dtype(bool) + first, second, dtype=pandas.api.types.pandas_dtype(f"bool[{backend}]") ) elif infer_dtypes == "common_cast": dtypes = maybe_compute_dtypes_common_cast( diff --git a/modin/core/dataframe/pandas/dataframe/dataframe.py b/modin/core/dataframe/pandas/dataframe/dataframe.py index bf13942b594..c0989b246c4 100644 --- a/modin/core/dataframe/pandas/dataframe/dataframe.py +++ b/modin/core/dataframe/pandas/dataframe/dataframe.py @@ -1269,8 +1269,15 @@ def _take_2d_positional( new_dtypes = self.dtypes.iloc[monotonic_col_idx] elif isinstance(self._dtypes, ModinDtypes): try: + supported_monotonic_col_idx = monotonic_col_idx + if isinstance(monotonic_col_idx, slice): + supported_monotonic_col_idx = pandas.RangeIndex( + monotonic_col_idx.start, + monotonic_col_idx.stop, + monotonic_col_idx.step, + ).to_list() new_dtypes = self._dtypes.lazy_get( - monotonic_col_idx, numeric_index=True + supported_monotonic_col_idx, numeric_index=True ) # can raise either on missing cache or on duplicated labels except (ValueError, NotImplementedError): diff --git a/modin/core/dataframe/pandas/metadata/dtypes.py b/modin/core/dataframe/pandas/metadata/dtypes.py index b904c6fbff6..88f575d7288 100644 --- a/modin/core/dataframe/pandas/metadata/dtypes.py +++ b/modin/core/dataframe/pandas/metadata/dtypes.py @@ -496,21 +496,18 @@ def _merge_dtypes( # in the 'dtypes_matrix' series = pandas.Series(dtypes, name=i) dtypes_matrix = pandas.concat([dtypes_matrix, series], axis=1) - dtypes_matrix.fillna( - value={ - # If we encountered a 'NaN' while 'val' describes all the columns, then - # it means, that the missing columns for this instance will be filled with NaNs (floats), - # otherwise, it may indicate missing columns that this 'val' has no info about, - # meaning that we shouldn't try computing a new dtype for this column, - # so marking it as 'unknown' - i: ( - pandas.api.types.pandas_dtype(float) - if val._know_all_names and val._remaining_dtype is None - else "unknown" - ) - }, - inplace=True, - ) + if val._know_all_names and 
val._remaining_dtype is None: + dtypes_matrix.fillna( + value={ + # If we encountered a 'NaN' while 'val' describes all the columns, then + # it means, that the missing columns for this instance will be filled with NaNs (floats), + # otherwise, it may indicate missing columns that this 'val' has no info about, + # meaning that we shouldn't try computing a new dtype for this column, + # so marking it as 'unknown' + i: "unknown", + }, + inplace=True, + ) elif isinstance(val, pandas.Series): dtypes_matrix = pandas.concat([dtypes_matrix, val], axis=1) elif val is None: @@ -531,7 +528,11 @@ def _merge_dtypes( def combine_dtypes(row): if (row == "unknown").any(): return "unknown" - row = row.fillna(pandas.api.types.pandas_dtype("float")) + if any("pyarrow" in str(x) for x in row): + # nans can be stored not only in float types, for example in `bool[pyarrow]` + row = row[~row.isna()] + else: + row = row.fillna(pandas.api.types.pandas_dtype("float")) return find_common_type(list(row.values)) dtypes = dtypes_matrix.apply(combine_dtypes, axis=1) diff --git a/modin/tests/pandas/dataframe/test_default.py b/modin/tests/pandas/dataframe/test_default.py index 28c2c20a53e..40e14eb55c9 100644 --- a/modin/tests/pandas/dataframe/test_default.py +++ b/modin/tests/pandas/dataframe/test_default.py @@ -715,7 +715,7 @@ def test_pivot_table_data(data, index, columns, values, aggfunc, request): "callable_tree_reduce_func" in request.node.callspec.id and "int_data" in request.node.callspec.id ): - expected_exception = TypeError("'numpy.float64' object is not callable") + expected_exception = TypeError("'float' object is not callable") eval_general( md_df, diff --git a/modin/tests/pandas/utils.py b/modin/tests/pandas/utils.py index 4a9be1dd58a..b88a5d0a0b0 100644 --- a/modin/tests/pandas/utils.py +++ b/modin/tests/pandas/utils.py @@ -1090,6 +1090,7 @@ def create_test_dfs(*args, **kwargs) -> tuple[pd.DataFrame, pandas.DataFrame]: post_fn = kwargs.pop("post_fn", None) if post_fn is None: + # TODO: REVERT ME post_fn = lambda df: df.convert_dtypes(dtype_backend="pyarrow") # noqa: E731 return tuple( map(post_fn, [pd.DataFrame(*args, **kwargs), pandas.DataFrame(*args, **kwargs)]) @@ -1107,6 +1108,7 @@ def create_test_series(vals, sort=False, **kwargs) -> tuple[pd.Series, pandas.Se modin_series = modin_series.sort_values().reset_index(drop=True) pandas_series = pandas_series.sort_values().reset_index(drop=True) + # TODO: REVERT ME modin_series = modin_series.convert_dtypes(dtype_backend="pyarrow") pandas_series = pandas_series.convert_dtypes(dtype_backend="pyarrow") return modin_series, pandas_series From afae62fd4e2e239e15c7c656148249d20c5c723d Mon Sep 17 00:00:00 2001 From: Anatoly Myachev Date: Thu, 2 May 2024 17:52:40 +0200 Subject: [PATCH 12/50] fix Signed-off-by: Anatoly Myachev --- modin/core/dataframe/algebra/binary.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/modin/core/dataframe/algebra/binary.py b/modin/core/dataframe/algebra/binary.py index 5138d728aba..019d7280289 100644 --- a/modin/core/dataframe/algebra/binary.py +++ b/modin/core/dataframe/algebra/binary.py @@ -260,9 +260,9 @@ def try_compute_new_dtypes( if any("pyarrow" in str(x) for x in first.dtypes) or any( "pyarrow" in str(x) for x in second.dtypes ): - backend = "pyarrow" + backend = "[pyarrow]" dtypes = maybe_build_dtypes_series( - first, second, dtype=pandas.api.types.pandas_dtype(f"bool[{backend}]") + first, second, dtype=pandas.api.types.pandas_dtype(f"bool{backend}") ) elif infer_dtypes == "common_cast": dtypes = 
maybe_compute_dtypes_common_cast(

From 316cddb4edbaf1214f4f696868f9ccd1bf0bb0e0 Mon Sep 17 00:00:00 2001
From: Anatoly Myachev
Date: Thu, 2 May 2024 18:31:30 +0200
Subject: [PATCH 13/50] fix

Signed-off-by: Anatoly Myachev
---
 modin/pandas/base.py | 25 +++++++++++++------------
 1 file changed, 13 insertions(+), 12 deletions(-)

diff --git a/modin/pandas/base.py b/modin/pandas/base.py
index 81dc3b03a46..013f90031bc 100644
--- a/modin/pandas/base.py
+++ b/modin/pandas/base.py
@@ -351,7 +351,7 @@ def _validate_other(
                 if label in other
             ]
         else:
-            other_dtypes = [type(x) for x in other]
+            other_dtypes = [x if pandas.isna(x) else type(x) for x in other]
         if compare_index:
             if not self.index.equals(other.index):
                 raise TypeError("Cannot perform operation with non-equal index")
@@ -371,17 +371,18 @@ def _validate_other(
             # TODO(https://github.com/modin-project/modin/issues/5239):
             # this spuriously rejects other that is a list including some
             # custom type that can be added to self's elements.
-            if not all(
-                (is_numeric_dtype(self_dtype) and is_numeric_dtype(other_dtype))
-                or (is_object_dtype(self_dtype) and is_object_dtype(other_dtype))
-                or (
-                    lib.is_np_dtype(self_dtype, "mM")
-                    and lib.is_np_dtype(self_dtype, "mM")
-                )
-                or is_dtype_equal(self_dtype, other_dtype)
-                for self_dtype, other_dtype in zip(self_dtypes, other_dtypes)
-            ):
-                raise TypeError("Cannot do operation with improper dtypes")
+            for self_dtype, other_dtype in zip(self_dtypes, other_dtypes):
+                if not (
+                    (is_numeric_dtype(self_dtype) and is_numeric_dtype(other_dtype))
+                    or (is_numeric_dtype(self_dtype) and pandas.isna(other_dtype))
+                    or (is_object_dtype(self_dtype) and is_object_dtype(other_dtype))
+                    or (
+                        lib.is_np_dtype(self_dtype, "mM")
+                        and lib.is_np_dtype(other_dtype, "mM")
+                    )
+                    or is_dtype_equal(self_dtype, other_dtype)
+                ):
+                    raise TypeError("Cannot do operation with improper dtypes")
         return result

     def _validate_function(self, func, on_invalid=None):

From 639c2edbb3e4baf34d527ee5e5e163b4f41547ef Mon Sep 17 00:00:00 2001
From: Anatoly Myachev
Date: Thu, 2 May 2024 20:39:35 +0200
Subject: [PATCH 14/50] fix pivot_table

Signed-off-by: Anatoly Myachev
---
 modin/core/storage_formats/pandas/groupby.py | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/modin/core/storage_formats/pandas/groupby.py b/modin/core/storage_formats/pandas/groupby.py
index e327efbda4a..4b22b5c0158 100644
--- a/modin/core/storage_formats/pandas/groupby.py
+++ b/modin/core/storage_formats/pandas/groupby.py
@@ -15,6 +15,7 @@

 import numpy as np
 import pandas
+from pandas.core.dtypes.cast import find_common_type

 from modin.config import use_range_partitioning_groupby
 from modin.core.dataframe.algebra import GroupByReduce
@@ -358,7 +359,9 @@ def applyier(df, other):  # pragma: no cover
             # transposing it back to be consistent with column axis values along
             # different partitions
             if len(index) == 0 and len(columns) > 0:
-                result = result.transpose()
+                common_type = find_common_type(result.dtypes.tolist())
+                # transpose loses dtypes: https://github.com/pandas-dev/pandas/issues/43337
+                result = result.transpose().astype(common_type, copy=False)

         return result

From 05f32e51d683c99c5b39e2920fe8630c993a9da6 Mon Sep 17 00:00:00 2001
From: Anatoly Myachev
Date: Thu, 2 May 2024 22:32:08 +0200
Subject: [PATCH 15/50] fix

Signed-off-by: Anatoly Myachev
---
 modin/tests/pandas/dataframe/test_default.py | 1 +
 modin/tests/pandas/utils.py                  | 5 +++--
 2 files changed, 4 insertions(+), 2 deletions(-)

diff --git a/modin/tests/pandas/dataframe/test_default.py 
b/modin/tests/pandas/dataframe/test_default.py
index 40e14eb55c9..0f3ca39fd72 100644
--- a/modin/tests/pandas/dataframe/test_default.py
+++ b/modin/tests/pandas/dataframe/test_default.py
@@ -634,6 +634,7 @@ def test_pivot(data, index, columns, values, request):
         expected_exception = ValueError(
             "Index contains duplicate entries, cannot reshape"
         )
+    # fails because pandas doesn't preserve the dtype backend
    eval_general(
         *create_test_dfs(data),
         lambda df, *args, **kwargs: df.pivot(*args, **kwargs),
diff --git a/modin/tests/pandas/utils.py b/modin/tests/pandas/utils.py
index b88a5d0a0b0..b04026393a9 100644
--- a/modin/tests/pandas/utils.py
+++ b/modin/tests/pandas/utils.py
@@ -662,9 +662,10 @@ def assert_dtypes_equal(df1, df2):
         lambda obj: isinstance(obj, pandas.PeriodDtype),
     )

-    for col in dtypes1.keys():
+    # `test_pivot_table_margins` failed due to the use of ``pd.NA`` in a column name
+    for idx in range(len(dtypes1)):
         for comparator in dtype_comparators:
-            if assert_all_act_same(comparator, dtypes1[col], dtypes2[col]):
+            if assert_all_act_same(comparator, dtypes1.iloc[idx], dtypes2.iloc[idx]):
                 # We met a dtype that both types satisfy, so we can stop iterating
                 # over comparators and compare next dtypes
                 break

From 194cc68c553c98d8acac5edd44c5905729ffaad2 Mon Sep 17 00:00:00 2001
From: Anatoly Myachev
Date: Fri, 3 May 2024 16:27:27 +0200
Subject: [PATCH 16/50] find potential problem areas at the query_compiler level

Signed-off-by: Anatoly Myachev
---
 modin/core/dataframe/algebra/binary.py        | 26 ++++--
 modin/core/dataframe/algebra/map.py           |  9 +-
 modin/core/dataframe/algebra/tree_reduce.py   |  2 +-
 .../dataframe/pandas/dataframe/dataframe.py   | 28 +++---
 .../core/dataframe/pandas/metadata/dtypes.py  | 29 ++++---
 modin/core/dataframe/pandas/metadata/index.py |  5 +-
 .../storage_formats/base/query_compiler.py    |  7 +-
 .../storage_formats/pandas/query_compiler.py  | 85 ++++++++++++-------
 modin/pandas/utils.py                         |  1 +
 9 files changed, 126 insertions(+), 66 deletions(-)

diff --git a/modin/core/dataframe/algebra/binary.py b/modin/core/dataframe/algebra/binary.py
index 019d7280289..acc84460b0e 100644
--- a/modin/core/dataframe/algebra/binary.py
+++ b/modin/core/dataframe/algebra/binary.py
@@ -13,8 +13,10 @@

 """Module houses builder class for Binary operator."""

+from __future__ import annotations
+
 import warnings
-from typing import Optional
+from typing import TYPE_CHECKING, Optional

 import numpy as np
 import pandas
@@ -24,9 +26,12 @@

 from .operator import Operator

+if TYPE_CHECKING:
+    from modin.core.storage_formats.pandas.query_compiler import PandasQueryCompiler
+

 def maybe_compute_dtypes_common_cast(
-    first,
+    first: PandasQueryCompiler,
     second,
     trigger_computations=False,
     axis=0,
@@ -80,6 +85,7 @@
         # belong to the intersection, these will be NaN columns in the result
         mismatch_columns = columns_first ^ columns_second
     elif isinstance(second, dict):
+        # TODO: pyarrow backend
        dtypes_second = {
             key: pandas.api.types.pandas_dtype(type(value))
             for key, value in second.items()
@@ -92,6 +98,7 @@
         mismatch_columns = columns_first.difference(columns_second)
     else:
         if isinstance(second, (list, tuple)):
+            # TODO: pyarrow backend
            second_dtypes_list = (
                 [pandas.api.types.pandas_dtype(type(value)) for value in second]
                 if axis == 1
@@ -100,6 +107,7 @@
                 else [np.array(second).dtype] * len(dtypes_first)
             )
         elif is_scalar(second) or isinstance(second, np.ndarray):
+            # TODO: pyarrow backend
            try:
                 dtype = getattr(second, "dtype", 
None) or pandas.api.types.pandas_dtype( type(second) @@ -125,6 +133,7 @@ def maybe_compute_dtypes_common_cast( mismatch_columns = [] # If at least one column doesn't match, the result of the non matching column would be nan. + # TODO: pyarrow backend nan_dtype = pandas.api.types.pandas_dtype(type(np.nan)) dtypes = None if func is not None: @@ -168,7 +177,7 @@ def maybe_compute_dtypes_common_cast( def maybe_build_dtypes_series( - first, second, dtype, trigger_computations=False + first: PandasQueryCompiler, second, dtype, trigger_computations=False ) -> Optional[pandas.Series]: """ Build a ``pandas.Series`` describing dtypes of the result of a binary operation. @@ -217,8 +226,13 @@ def maybe_build_dtypes_series( def try_compute_new_dtypes( - first, second, infer_dtypes=None, result_dtype=None, axis=0, func=None -): + first: PandasQueryCompiler, + second, + infer_dtypes=None, + result_dtype=None, + axis=0, + func=None, +) -> Optional[pandas.Series]: """ Precompute resulting dtypes of the binary operation if possible. @@ -235,7 +249,7 @@ def try_compute_new_dtypes( infer_dtypes : {"common_cast", "try_sample", "bool", None}, default: None How dtypes should be infered (see ``Binary.register`` doc for more info). result_dtype : np.dtype, optional - NumPy dtype of the result. If not specified it will be inferred from the `infer_dtypes` parameter. + NumPy dtype of the result. If not specified it will be inferred from the `infer_dtypes` parameter. Only NumPy? axis : int, default: 0 Axis to perform the binary operation along. func : callable(pandas.DataFrame, pandas.DataFrame) -> pandas.DataFrame, optional diff --git a/modin/core/dataframe/algebra/map.py b/modin/core/dataframe/algebra/map.py index 57b21f6e1b0..aefebe6c017 100644 --- a/modin/core/dataframe/algebra/map.py +++ b/modin/core/dataframe/algebra/map.py @@ -13,8 +13,15 @@ """Module houses builder class for Map operator.""" +from __future__ import annotations + +from typing import TYPE_CHECKING + from .operator import Operator +if TYPE_CHECKING: + from modin.core.storage_formats.pandas.query_compiler import PandasQueryCompiler + class Map(Operator): """Builder class for Map operator.""" @@ -41,7 +48,7 @@ def register(cls, function, *call_args, **call_kwds): Function that takes query compiler and executes map function. """ - def caller(query_compiler, *args, **kwargs): + def caller(query_compiler: PandasQueryCompiler, *args, **kwargs): """Execute Map function against passed query compiler.""" shape_hint = call_kwds.pop("shape_hint", None) or query_compiler._shape_hint return query_compiler.__constructor__( diff --git a/modin/core/dataframe/algebra/tree_reduce.py b/modin/core/dataframe/algebra/tree_reduce.py index fa7b731e6f5..8a30196cbeb 100644 --- a/modin/core/dataframe/algebra/tree_reduce.py +++ b/modin/core/dataframe/algebra/tree_reduce.py @@ -35,7 +35,7 @@ def register( axis : int, optional Specifies axis to apply function along. compute_dtypes : callable(pandas.Series, *func_args, **func_kwargs) -> np.dtype, optional - Callable for computing dtypes. + Callable for computing dtypes. Only NumPy? Returns ------- diff --git a/modin/core/dataframe/pandas/dataframe/dataframe.py b/modin/core/dataframe/pandas/dataframe/dataframe.py index c0989b246c4..4a4bb8906d9 100644 --- a/modin/core/dataframe/pandas/dataframe/dataframe.py +++ b/modin/core/dataframe/pandas/dataframe/dataframe.py @@ -17,6 +17,9 @@ PandasDataframe is a parent abstract class for any dataframe class for pandas storage format. 
""" + +from __future__ import annotations + import datetime import re from typing import TYPE_CHECKING, Callable, Dict, Hashable, List, Optional, Union @@ -97,15 +100,18 @@ class PandasDataframe( # These properties flag whether or not we are deferring the metadata synchronization _deferred_index = False _deferred_column = False + _index_cache: ModinIndex = None + _columns_cache: ModinIndex = None + _dtypes: Optional[ModinDtypes] = None @pandas.util.cache_readonly - def __constructor__(self): + def __constructor__(self) -> Callable[..., PandasDataframe]: """ Create a new instance of this object. Returns ------- - PandasDataframe + callable """ return type(self) @@ -451,9 +457,6 @@ def dtype_builder(df): dtypes.name = None return dtypes - _index_cache = None - _columns_cache = None - def set_index_cache(self, index): """ Set index cache. @@ -2230,6 +2233,7 @@ def map( if isinstance(new_columns, ModinIndex): # Materializing lazy columns in order to build dtype's index new_columns = new_columns.get(return_lengths=False) + # TODO: consider backend dtypes = pandas.Series( [pandas.api.types.pandas_dtype(dtypes)] * len(new_columns), index=new_columns, @@ -2894,7 +2898,7 @@ def apply_full_axis( enumerate_partitions : bool, default: False Whether pass partition index into applied `func` or not. Note that `func` must be able to obtain `partition_idx` kwarg. - dtypes : list-like, optional + dtypes : list-like or scalar, optional The data types of the result. This is an optimization because there are functions that always result in a particular data type, and allows us to avoid (re)computing it. @@ -2948,7 +2952,7 @@ def apply_full_axis_select_indices( new_index=None, new_columns=None, keep_remaining=False, - new_dtypes=None, + new_dtypes: Optional[Union[pandas.Series, ModinDtypes]] = None, ): """ Apply a function across an entire axis for a subset of the data. @@ -3017,10 +3021,10 @@ def apply_select_indices( col_labels=None, new_index=None, new_columns=None, - new_dtypes=None, + new_dtypes: Optional[pandas.Series] = None, keep_remaining=False, item_to_distribute=no_default, - ): + ) -> PandasDataframe: """ Apply a function for a subset of the data. @@ -3405,12 +3409,12 @@ def broadcast_apply_full_axis( new_columns : list-like, optional Columns of the result. We may know this in advance, and if not provided it must be computed. - apply_indices : list-like, default: None + apply_indices : list-like, optional Indices of `axis ^ 1` to apply function over. enumerate_partitions : bool, default: False Whether pass partition index into applied `func` or not. Note that `func` must be able to obtain `partition_idx` kwarg. - dtypes : list-like, default: None + dtypes : list-like or scalar, optional Data types of the result. This is an optimization because there are functions that always result in a particular data type, and allows us to avoid (re)computing it. 
@@ -3486,6 +3490,7 @@ def broadcast_apply_full_axis( if new_columns is None: kw["dtypes"] = ModinDtypes( DtypesDescriptor( + # TODO: pyarrow backend remaining_dtype=pandas.api.types.pandas_dtype(dtypes) ) ) @@ -3494,6 +3499,7 @@ def broadcast_apply_full_axis( pandas.Series(dtypes, index=new_columns) if is_list_like(dtypes) else pandas.Series( + # TODO: pyarrow backend [pandas.api.types.pandas_dtype(dtypes)] * len(new_columns), index=new_columns, ) diff --git a/modin/core/dataframe/pandas/metadata/dtypes.py b/modin/core/dataframe/pandas/metadata/dtypes.py index 88f575d7288..ec227cde32f 100644 --- a/modin/core/dataframe/pandas/metadata/dtypes.py +++ b/modin/core/dataframe/pandas/metadata/dtypes.py @@ -13,6 +13,8 @@ """Module contains class ``ModinDtypes``.""" +from __future__ import annotations + from typing import TYPE_CHECKING, Callable, Optional, Union import numpy as np @@ -62,6 +64,7 @@ def __init__( self, known_dtypes: Optional[Union[dict[IndexLabel, np.dtype], pandas.Series]] = None, cols_with_unknown_dtypes: Optional[list[IndexLabel]] = None, + # TODO: what if there is a type of another backend remaining_dtype: Optional[np.dtype] = None, parent_df: Optional["PandasDataframe"] = None, columns_order: Optional[dict[int, IndexLabel]] = None, @@ -747,9 +750,7 @@ class ModinDtypes: def __init__( self, - value: Optional[ - Union[Callable, pandas.Series, DtypesDescriptor, "ModinDtypes"] - ], + value: Optional[Union[Callable, pandas.Series, DtypesDescriptor, ModinDtypes]], ): if callable(value) or isinstance(value, pandas.Series): self._value = value @@ -779,6 +780,7 @@ def is_materialized(self) -> bool: """ return isinstance(self._value, pandas.Series) + # TODO: pyarrow backend def get_dtypes_set(self) -> set[np.dtype]: """ Get a set of dtypes from the descriptor. @@ -793,9 +795,7 @@ def get_dtypes_set(self) -> set[np.dtype]: self.get() return set(self._value.values) - def maybe_specify_new_frame_ref( - self, new_parent: "PandasDataframe" - ) -> "ModinDtypes": + def maybe_specify_new_frame_ref(self, new_parent: PandasDataframe) -> ModinDtypes: """ Set a new parent for the stored value if needed. @@ -817,7 +817,7 @@ def maybe_specify_new_frame_ref( return new_self return new_self - def lazy_get(self, ids: list, numeric_index: bool = False) -> "ModinDtypes": + def lazy_get(self, ids: list, numeric_index: bool = False) -> ModinDtypes: """ Get new ``ModinDtypes`` for a subset of columns without triggering any computations. @@ -849,7 +849,7 @@ def lazy_get(self, ids: list, numeric_index: bool = False) -> "ModinDtypes": return ModinDtypes(self._value.iloc[ids] if numeric_index else self._value[ids]) @classmethod - def concat(cls, values: list, axis: int = 0) -> "ModinDtypes": + def concat(cls, values: list, axis: int = 0) -> ModinDtypes: """ Concatenate dtypes. @@ -893,7 +893,7 @@ def concat(cls, values: list, axis: int = 0) -> "ModinDtypes": desc = pandas.concat(values) return ModinDtypes(desc) - def set_index(self, new_index: Union[pandas.Index, "ModinIndex"]) -> "ModinDtypes": + def set_index(self, new_index: Union[pandas.Index, ModinIndex]) -> ModinDtypes: """ Set new column names for stored dtypes. @@ -997,7 +997,7 @@ def __getattr__(self, name): self.get() return self._value.__getattribute__(name) - def copy(self) -> "ModinDtypes": + def copy(self) -> ModinDtypes: """ Copy an object without materializing the internal representation. 
@@ -1235,13 +1235,22 @@ def extract_dtype(value): from modin.pandas.utils import is_scalar if hasattr(value, "dtype"): + # If we're dealing with a numpy scalar (np.int, np.datetime64, ...) + # we would like to get its internal dtype return value.dtype + elif hasattr(value, "to_numpy"): + # If we're dealing with a scalar that can be converted to numpy (for example pandas.Timestamp) + # we would like to convert it and get its proper internal dtype + return value.to_numpy().dtype elif hasattr(value, "dtypes"): return value.dtypes elif is_scalar(value): if value is None: # previous type was object instead of 'float64' return pandas.api.types.pandas_dtype(value) + # TODO: backend is not taken into account + # pd.api.types.pandas_dtype(pd.ArrowDtype(pa.array([1,2,3]).type)) return pandas.api.types.pandas_dtype(type(value)) else: + # TODO: new way without numpy? return np.array(value).dtype diff --git a/modin/core/dataframe/pandas/metadata/index.py b/modin/core/dataframe/pandas/metadata/index.py index d5aa37e52a0..b731a99bc73 100644 --- a/modin/core/dataframe/pandas/metadata/index.py +++ b/modin/core/dataframe/pandas/metadata/index.py @@ -15,6 +15,7 @@ import functools import uuid +from typing import Optional import pandas from pandas.core.dtypes.common import is_list_like @@ -44,7 +45,7 @@ class ModinIndex: Materialized dtypes of index levels. """ - def __init__(self, value=None, axis=None, dtypes=None): + def __init__(self, value=None, axis=None, dtypes: Optional[pandas.Series] = None): from modin.core.dataframe.pandas.dataframe.dataframe import PandasDataframe self._is_default_callable = False @@ -69,7 +70,7 @@ def __init__(self, value=None, axis=None, dtypes=None): self._index_id = uuid.uuid4() self._lengths_id = uuid.uuid4() - def maybe_get_dtypes(self): + def maybe_get_dtypes(self) -> Optional[pandas.Series]: """ Get index dtypes if available. diff --git a/modin/core/storage_formats/base/query_compiler.py b/modin/core/storage_formats/base/query_compiler.py index ae78bc27f09..c6e09d00dfb 100644 --- a/modin/core/storage_formats/base/query_compiler.py +++ b/modin/core/storage_formats/base/query_compiler.py @@ -17,9 +17,11 @@ ``BaseQueryCompiler`` is a parent abstract class for any other query compiler class. """ +from __future__ import annotations + import abc import warnings -from typing import Hashable, List, Optional +from typing import Callable, Hashable, List, Optional import numpy as np import pandas @@ -4279,6 +4281,7 @@ def get_positions_from_labels(self, row_loc, col_loc): # `Index.get_indexer_for` works much faster with numpy arrays than with python lists, # so although we lose some time here on converting to numpy, `Index.get_indexer_for` # speedup covers the loss that we gain here. + # TODO: pyarrow backend? axis_loc = np.array(axis_loc, dtype=axis_labels.dtype) axis_lookup = axis_labels.get_indexer_for(axis_loc) # `Index.get_indexer_for` sets -1 value for missing labels, we have to verify whether @@ -4456,7 +4459,7 @@ def write_items(df, broadcasted_items): # END Abstract methods for QueryCompiler @pandas.util.cache_readonly - def __constructor__(self): + def __constructor__(self) -> Callable[..., BaseQueryCompiler]: """ Get query compiler constructor. diff --git a/modin/core/storage_formats/pandas/query_compiler.py b/modin/core/storage_formats/pandas/query_compiler.py index 060e7849768..b139ad4ff56 100644 --- a/modin/core/storage_formats/pandas/query_compiler.py +++ b/modin/core/storage_formats/pandas/query_compiler.py @@ -18,12 +18,14 @@ queries for the ``PandasDataframe``. 
""" +from __future__ import annotations + import ast import hashlib import re import warnings from collections.abc import Iterable -from typing import Hashable, List +from typing import TYPE_CHECKING, Hashable, List, Optional import numpy as np import pandas @@ -37,6 +39,7 @@ is_datetime64_any_dtype, is_list_like, is_numeric_dtype, + is_timedelta64_dtype, ) from pandas.core.groupby.base import transformation_kernels from pandas.core.indexes.api import ensure_index_from_sequences @@ -79,6 +82,9 @@ from .merge import MergeImpl from .utils import get_group_names, merge_partitioning +if TYPE_CHECKING: + from modin.core.dataframe.pandas.dataframe.dataframe import PandasDataframe + def _get_axis(axis): """ @@ -263,7 +269,10 @@ class PandasQueryCompiler(BaseQueryCompiler): Shape hint for frames known to be a column or a row, otherwise None. """ - def __init__(self, modin_frame, shape_hint=None): + _modin_frame: PandasDataframe + _shape_hint: Optional[str] + + def __init__(self, modin_frame: PandasDataframe, shape_hint: Optional[str] = None): self._modin_frame = modin_frame self._shape_hint = shape_hint @@ -935,6 +944,7 @@ def compute_dtypes_fn(dtypes, axis, **kwargs): and any(is_numeric_dtype(t) for t in dtypes) ): return "object" + # how to take into account backend here? return "float64" return TreeReduce.register( @@ -1846,39 +1856,41 @@ def isin_func(df, values): abs = Map.register(pandas.DataFrame.abs, dtypes="copy") map = Map.register(pandas.DataFrame.map) + # Will it work with pyarrow backend? conj = Map.register(lambda df, *args, **kwargs: pandas.DataFrame(np.conj(df))) convert_dtypes = Fold.register(pandas.DataFrame.convert_dtypes) invert = Map.register(pandas.DataFrame.__invert__, dtypes="copy") isna = Map.register(pandas.DataFrame.isna, dtypes="bool") + # better way to distinguish methods for NumPy API? _isfinite = Map.register( lambda df, *args, **kwargs: pandas.DataFrame(np.isfinite(df, *args, **kwargs)), - dtypes=np.bool_, + dtypes="bool", ) _isinf = Map.register( # Needed for numpy API lambda df, *args, **kwargs: pandas.DataFrame(np.isinf(df, *args, **kwargs)), - dtypes=np.bool_, + dtypes="bool", ) _isnat = Map.register( # Needed for numpy API lambda df, *args, **kwargs: pandas.DataFrame(np.isnat(df, *args, **kwargs)), - dtypes=np.bool_, + dtypes="bool", ) _isneginf = Map.register( # Needed for numpy API lambda df, *args, **kwargs: pandas.DataFrame(np.isneginf(df, *args, **kwargs)), - dtypes=np.bool_, + dtypes="bool", ) _isposinf = Map.register( # Needed for numpy API lambda df, *args, **kwargs: pandas.DataFrame(np.isposinf(df, *args, **kwargs)), - dtypes=np.bool_, + dtypes="bool", ) _iscomplex = Map.register( # Needed for numpy API lambda df, *args, **kwargs: pandas.DataFrame(np.iscomplex(df, *args, **kwargs)), - dtypes=np.bool_, + dtypes="bool", ) _isreal = Map.register( # Needed for numpy API lambda df, *args, **kwargs: pandas.DataFrame(np.isreal(df, *args, **kwargs)), - dtypes=np.bool_, + dtypes="bool", ) - _logical_not = Map.register(np.logical_not, dtypes=np.bool_) # Needed for numpy API + _logical_not = Map.register(np.logical_not, dtypes="bool") # Needed for numpy API _tanh = Map.register( lambda df, *args, **kwargs: pandas.DataFrame(np.tanh(df, *args, **kwargs)) ) # Needed for numpy API @@ -2122,6 +2134,7 @@ def astype(self, col_dtypes, errors: str = "raise"): # other query compilers may not take care of error handling at the API # layer. This query compiler assumes there won't be any errors due to # invalid type keys. 
+ # Function that can change the backend return self.__constructor__( self._modin_frame.astype(col_dtypes, errors=errors), shape_hint=self._shape_hint, @@ -2280,6 +2293,7 @@ def map_func(df): # pragma: no cover """Compute covariance or correlation matrix for the passed frame.""" df = df.to_numpy() n_rows = df.shape[0] + # Does it work with pyarrow backend? df_mask = np.isfinite(df) result = np.empty((n_rows, n_cols), dtype="float64") @@ -2604,7 +2618,11 @@ def quantile_for_list_of_values(self, **kwargs): new_columns = [ col for col, dtype in zip(self.columns, self.dtypes) - if (is_numeric_dtype(dtype) or lib.is_np_dtype(dtype, "mM")) + if ( + is_numeric_dtype(dtype) + or is_timedelta64_dtype(dtype) + or is_datetime64_any_dtype(dtype) + ) ] if axis == 1: query_compiler = self.getitem_column_array(new_columns) @@ -2799,13 +2817,14 @@ def applyier(df, internal_indices, other=[], internal_other_indices=[]): # __getitem__ methods __getitem_bool = Binary.register( + # TODO: `is_scalar` don't work with pyarrow scalars lambda df, r: df[[r]] if is_scalar(r) else df[r], join_type="left", labels="drop", ) # __setitem__ methods - def setitem_bool(self, row_loc, col_loc, item): + def setitem_bool(self, row_loc: PandasQueryCompiler, col_loc, item): def _set_item(df, row_loc): # pragma: no cover df = df.copy() df.loc[row_loc.squeeze(axis=1), col_loc] = item @@ -2814,18 +2833,7 @@ def _set_item(df, row_loc): # pragma: no cover if self._modin_frame.has_materialized_dtypes and is_scalar(item): new_dtypes = self.dtypes.copy() old_dtypes = new_dtypes[col_loc] - - if hasattr(item, "dtype"): - # If we're dealing with a numpy scalar (np.int, np.datetime64, ...) - # we would like to get its internal dtype - item_type = item.dtype - elif hasattr(item, "to_numpy"): - # If we're dealing with a scalar that can be converted to numpy (for example pandas.Timestamp) - # we would like to convert it and get its proper internal dtype - item_type = item.to_numpy().dtype - else: - item_type = pandas.api.types.pandas_dtype(type(item)) - + item_type = extract_dtype(item) if isinstance(old_dtypes, pandas.Series): new_dtypes[col_loc] = [ find_common_type([dtype, item_type]) for dtype in old_dtypes.values @@ -2893,7 +2901,9 @@ def getitem_array(self, key): ) return self.getitem_column_array(key) - def getitem_column_array(self, key, numeric=False, ignore_order=False): + def getitem_column_array( + self, key, numeric=False, ignore_order=False + ) -> PandasQueryCompiler: shape_hint = "column" if len(key) == 1 else None if numeric: if ignore_order and is_list_like(key): @@ -3053,6 +3063,7 @@ def mapper(df: pandas.DataFrame): ) # we have to keep other columns so setting their mask # values with `False` + # TODO: pyarrow backend? mask = pandas.Series( np.zeros(df.shape[1], dtype=bool), index=df.columns ) @@ -3105,7 +3116,9 @@ def reduce(df: pandas.DataFrame, mask: pandas.DataFrame): shape_hint=self._shape_hint, ) - def drop(self, index=None, columns=None, errors: str = "raise"): + def drop( + self, index=None, columns=None, errors: str = "raise" + ) -> PandasQueryCompiler: # `errors` parameter needs to be part of the function signature because # other query compilers may not take care of error handling at the API # layer. 
This query compiler assumes there won't be any errors due to @@ -3152,7 +3165,8 @@ def _compute_duplicated(df): # pragma: no cover hashed_modin_frame = self._modin_frame.reduce( axis=1, function=_compute_hash, - dtypes=pandas.api.types.pandas_dtype("O"), + # TODO: pyarrow backend + dtypes="object", ) else: hashed_modin_frame = self._modin_frame @@ -3906,7 +3920,7 @@ def agg_func(grp, *args, **kwargs): add_missing_cats=add_missing_cats, **groupby_kwargs, ) - result_qc = self.__constructor__(result) + result_qc: PandasQueryCompiler = self.__constructor__(result) if not is_transform and not groupby_kwargs.get("as_index", True): return result_qc.reset_index(drop=True) @@ -4440,14 +4454,15 @@ def map_fn(df): # pragma: no cover # efficient if we are mapping over all of the data to do it this way # than it would be to reuse the code for specific columns. if len(columns) == len(self.columns): + # TODO: pyarrow backend new_modin_frame = self._modin_frame.apply_full_axis( - 0, map_fn, new_index=self.index, dtypes=bool + 0, map_fn, new_index=self.index, dtypes="bool" ) untouched_frame = None else: new_modin_frame = self._modin_frame.take_2d_labels_or_positional( col_labels=columns - ).apply_full_axis(0, map_fn, new_index=self.index, dtypes=bool) + ).apply_full_axis(0, map_fn, new_index=self.index, dtypes="bool") untouched_frame = self.drop(columns=columns) # If we mapped over all the data we are done. If not, we need to # prepend the `new_modin_frame` with the raw data from the columns that were @@ -4496,10 +4511,11 @@ def iloc_mut(partition, row_internal_indices, col_internal_indices, item): pandas.DataFrame Partition data with updated values. """ - partition = partition.copy() try: partition.iloc[row_internal_indices, col_internal_indices] = item except ValueError: + # maybe make a copy only if there is an exception? + partition = partition.copy() # `copy` is needed to avoid "ValueError: buffer source array is read-only" for `item` # because the item may be converted to the type that is in the dataframe. 
# TODO: in the future we will need to convert to the correct type manually according @@ -4519,7 +4535,7 @@ def iloc_mut(partition, row_internal_indices, col_internal_indices, item): ) else: broadcasted_item, broadcasted_dtypes = item, pandas.Series( - [np.array(item).dtype] * len(col_numeric_index) + [extract_dtype(item)] * len(col_numeric_index) ) new_dtypes = None @@ -4572,7 +4588,10 @@ def sort_columns_by_row_values(self, rows, ascending=True, **kwargs): def cat_codes(self): def func(df: pandas.DataFrame) -> pandas.DataFrame: ser = df.iloc[:, 0] - assert isinstance(ser.dtype, pandas.CategoricalDtype) + if not isinstance(ser.dtype, pandas.CategoricalDtype): + raise TypeError( + f"Series dtype should be `CategoricalDtype`: actual dtype: {ser.dtype}" + ) return ser.cat.codes.to_frame(name=MODIN_UNNAMED_SERIES_LABEL) res = self._modin_frame.map(func=func, new_columns=[MODIN_UNNAMED_SERIES_LABEL]) diff --git a/modin/pandas/utils.py b/modin/pandas/utils.py index 03a9078b0ee..3a9ad6aa13d 100644 --- a/modin/pandas/utils.py +++ b/modin/pandas/utils.py @@ -297,6 +297,7 @@ def broadcast_item( try: # Cast to numpy drop information about heterogeneous types (cast to common) # TODO: we shouldn't do that, maybe there should be the if branch + # TODO: what if item comes from pyarrow item = np.array(item) if dtypes is None: dtypes = pandas.Series([item.dtype] * len(col_lookup)) From 91f2607a94d305f891273b177bc218d2d8d7f8a9 Mon Sep 17 00:00:00 2001 From: Anatoly Myachev Date: Fri, 3 May 2024 18:23:50 +0200 Subject: [PATCH 17/50] some more places Signed-off-by: Anatoly Myachev --- .../dataframe/pandas/dataframe/dataframe.py | 108 ++++++++++-------- .../core/dataframe/pandas/metadata/dtypes.py | 54 ++++----- .../storage_formats/pandas/aggregations.py | 2 + modin/pandas/dataframe.py | 5 + modin/tests/pandas/dataframe/test_reduce.py | 1 + 5 files changed, 95 insertions(+), 75 deletions(-) diff --git a/modin/core/dataframe/pandas/dataframe/dataframe.py b/modin/core/dataframe/pandas/dataframe/dataframe.py index 4a4bb8906d9..78d297b11b0 100644 --- a/modin/core/dataframe/pandas/dataframe/dataframe.py +++ b/modin/core/dataframe/pandas/dataframe/dataframe.py @@ -56,6 +56,9 @@ from modin.core.dataframe.base.interchange.dataframe_protocol.dataframe import ( ProtocolDataframe, ) + from modin.core.dataframe.pandas.partitioning.partition_manager import ( + PandasDataframePartitionManager, + ) from pandas._typing import npt from modin.logging import ClassLogger @@ -95,7 +98,7 @@ class PandasDataframe( The data types for the dataframe columns. """ - _partition_mgr_cls = None + _partition_mgr_cls: PandasDataframePartitionManager = None _query_compiler_cls = PandasQueryCompiler # These properties flag whether or not we are deferring the metadata synchronization _deferred_index = False @@ -122,7 +125,7 @@ def __init__( columns=None, row_lengths=None, column_widths=None, - dtypes=None, + dtypes: Optional[Union[pandas.Series, ModinDtypes, Callable]] = None, ): self._partitions = partitions self.set_index_cache(index) @@ -412,13 +415,13 @@ def get_dtypes_set(self): return self._dtypes.get_dtypes_set() return set(self.dtypes.values) - def _compute_dtypes(self, columns=None): + def _compute_dtypes(self, columns=None) -> pandas.Series: """ Compute the data types via TreeReduce pattern for the specified columns. Parameters ---------- - columns : list-like, default: None + columns : list-like, optional Columns to compute dtypes for. If not specified compute dtypes for all the columns in the dataframe. 
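The TreeReduce kernel behind `_compute_dtypes` boils down to combining the
dtypes reported by each row partition into one common dtype per column; a
simplified sketch with made-up partition dtypes (the in-tree kernel is more
involved):

    import pandas
    from pandas.core.dtypes.cast import find_common_type

    # dtypes of two row partitions of the same two-column frame
    part_dtypes = [
        pandas.Series({"a": "int64", "b": "float64"}).map(pandas.api.types.pandas_dtype),
        pandas.Series({"a": "float64", "b": "float64"}).map(pandas.api.types.pandas_dtype),
    ]
    # reduce to the common dtype per column: "a" -> float64, "b" -> float64
    common = pandas.concat(part_dtypes, axis=1).apply(
        lambda row: find_common_type(list(row.values)), axis=1
    )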
@@ -862,7 +865,7 @@ def synchronize_labels(self, axis=None): Parameters ---------- - axis : int, default: None + axis : int, optional The deferred axis. 0 for the index, 1 for the columns. """ @@ -883,7 +886,7 @@ def _propagate_index_objs(self, axis=None): Parameters ---------- - axis : int, default: None + axis : int, optional The axis to apply to. If it's None applies to both axes. """ self._filter_empties(compute_metadata=False) @@ -987,7 +990,7 @@ def take_2d_labels_or_positional( row_positions: Optional[List[int]] = None, col_labels: Optional[List[Hashable]] = None, col_positions: Optional[List[int]] = None, - ) -> "PandasDataframe": + ) -> PandasDataframe: """ Lazily select columns or rows from given indices. @@ -1143,7 +1146,7 @@ def _take_2d_positional( self, row_positions: Optional[List[int]] = None, col_positions: Optional[List[int]] = None, - ) -> "PandasDataframe": + ) -> PandasDataframe: """ Lazily select columns or rows from given indices. @@ -1323,10 +1326,10 @@ def _take_2d_positional( def _maybe_reorder_labels( self, - intermediate: "PandasDataframe", + intermediate: PandasDataframe, row_positions, col_positions, - ) -> "PandasDataframe": + ) -> PandasDataframe: """ Call re-order labels on take_2d_labels_or_positional result if necessary. @@ -1390,7 +1393,7 @@ def _maybe_reorder_labels( ) @lazy_metadata_decorator(apply_axis="rows") - def from_labels(self) -> "PandasDataframe": + def from_labels(self) -> PandasDataframe: """ Convert the row labels to a column of data, inserted at the first position. @@ -1492,7 +1495,7 @@ def from_labels_executor(df, **kwargs): result.synchronize_labels(axis=0) return result - def to_labels(self, column_list: List[Hashable]) -> "PandasDataframe": + def to_labels(self, column_list: List[Hashable]) -> PandasDataframe: """ Move one or more columns into the row labels. Previous labels are dropped. @@ -1665,6 +1668,7 @@ def astype(self, col_dtypes, errors: str = "raise"): if new_dtypes is None: new_dtypes = self_dtypes.copy() # Update the new dtype series to the proper pandas dtype + # TODO: pyarrow backend? new_dtype = pandas.api.types.pandas_dtype(dtype) if Engine.get() == "Dask" and hasattr(dtype, "_is_materialized"): # FIXME: https://github.com/dask/distributed/issues/8585 @@ -1694,6 +1698,7 @@ def astype_builder(df): # Assume that the dtype is a scalar. if not (col_dtypes == self_dtypes).all(): new_dtypes = self_dtypes.copy() + # TODO: pyarrow backend? new_dtype = pandas.api.types.pandas_dtype(col_dtypes) if Engine.get() == "Dask" and hasattr(new_dtype, "_is_materialized"): # FIXME: https://github.com/dask/distributed/issues/8585 @@ -1935,7 +1940,7 @@ def _join_index_objects(axis, indexes, how, sort, fill_value=None): considered to be the first index in the `indexes` list. sort : boolean Whether or not to sort the joined index. - fill_value : any, default: None + fill_value : any, optional Value to use for missing values. Returns @@ -2084,6 +2089,7 @@ def _compute_tree_reduce_metadata(self, axis, new_parts, dtypes=None): if dtypes == "copy": dtypes = self.copy_dtypes_cache() elif dtypes is not None: + # TODO: pyarrow backend? dtypes = pandas.Series( [pandas.api.types.pandas_dtype(dtypes)] * len(new_axes[1]), index=new_axes[1], @@ -2103,7 +2109,7 @@ def reduce( axis: Union[int, Axis], function: Callable, dtypes: Optional[str] = None, - ) -> "PandasDataframe": + ) -> PandasDataframe: """ Perform a user-defined aggregation on the specified axis, where the axis reduces down to a singleton. Requires knowledge of the full axis for the reduction. 
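The `# TODO: pyarrow backend?` notes in these hunks are addressed later in the
series by a `construct_dtype` helper on `PandasDataframe`; a standalone sketch
of its intended behavior (the free-function form and the final assert are for
illustration only, and the pyarrow branch assumes pyarrow is installed):

    import pandas

    def construct_dtype(dtype, backend=None):
        if backend is None:
            return pandas.api.types.pandas_dtype(dtype)
        elif backend == "pyarrow":
            # "bool" -> ArrowDtype rendered as "bool[pyarrow]"
            return pandas.api.types.pandas_dtype(f"{dtype}[{backend}]")
        raise NotImplementedError(backend)

    assert str(construct_dtype("bool", "pyarrow")) == "bool[pyarrow]"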
@@ -2141,7 +2147,7 @@ def tree_reduce( map_func: Callable, reduce_func: Optional[Callable] = None, dtypes: Optional[str] = None, - ) -> "PandasDataframe": + ) -> PandasDataframe: """ Apply function that will reduce the data to a pandas Series. @@ -2188,7 +2194,7 @@ def map( func_args=None, func_kwargs=None, lazy=False, - ) -> "PandasDataframe": + ) -> PandasDataframe: """ Perform a function that maps across the entire dataset. @@ -2253,7 +2259,7 @@ def window( reduce_fn: Callable, window_size: int, result_schema: Optional[Dict[Hashable, type]] = None, - ) -> "PandasDataframe": + ) -> PandasDataframe: """ Apply a sliding window operator that acts as a GROUPBY on each window, and reduces down to a single row (column) per window. @@ -2326,7 +2332,7 @@ def fold(self, axis, func, new_columns=None): self._column_widths_cache, ) - def infer_objects(self) -> "PandasDataframe": + def infer_objects(self) -> PandasDataframe: """ Attempt to infer better dtypes for object columns. @@ -2344,7 +2350,7 @@ def infer_objects(self) -> "PandasDataframe": ] return self.infer_types(obj_cols) - def infer_types(self, col_labels: List[str]) -> "PandasDataframe": + def infer_types(self, col_labels: List[str]) -> PandasDataframe: """ Determine the compatible type shared by all values in the specified columns, and coerce them to that type. @@ -2378,7 +2384,7 @@ def join( condition: Callable, other: ModinDataframe, join_type: Union[str, JoinType], - ) -> "PandasDataframe": + ) -> PandasDataframe: """ Join this dataframe with the other. @@ -2414,7 +2420,7 @@ def rename( self, new_row_labels: Optional[Union[Dict[Hashable, Hashable], Callable]] = None, new_col_labels: Optional[Union[Dict[Hashable, Hashable], Callable]] = None, - ) -> "PandasDataframe": + ) -> PandasDataframe: """ Replace the row and column labels with the specified new labels. @@ -2670,7 +2676,7 @@ def sort_by( columns: Union[str, List[str]], ascending: bool = True, **kwargs, - ) -> "PandasDataframe": + ) -> PandasDataframe: """ Logically reorder rows (columns if axis=1) lexicographically by the data in a column or set of columns. @@ -2738,7 +2744,7 @@ def sort_function(df): # pragma: no cover return result @lazy_metadata_decorator(apply_axis="both") - def filter(self, axis: Union[Axis, int], condition: Callable) -> "PandasDataframe": + def filter(self, axis: Union[Axis, int], condition: Callable) -> PandasDataframe: """ Filter data based on the function provided along an entire axis. @@ -2780,7 +2786,7 @@ def filter(self, axis: Union[Axis, int], condition: Callable) -> "PandasDatafram self.copy_dtypes_cache() if axis == Axis.COL_WISE else None, ) - def filter_by_types(self, types: List[Hashable]) -> "PandasDataframe": + def filter_by_types(self, types: List[Hashable]) -> PandasDataframe: """ Allow the user to specify a type or set of types by which to filter the columns. @@ -2799,7 +2805,7 @@ def filter_by_types(self, types: List[Hashable]) -> "PandasDataframe": ) @lazy_metadata_decorator(apply_axis="both") - def explode(self, axis: Union[int, Axis], func: Callable) -> "PandasDataframe": + def explode(self, axis: Union[int, Axis], func: Callable) -> PandasDataframe: """ Explode list-like entries along an entire axis. @@ -2834,7 +2840,7 @@ def explode(self, axis: Union[int, Axis], func: Callable) -> "PandasDataframe": partitions, new_index, new_columns, row_lengths, column_widths ) - def combine(self) -> "PandasDataframe": + def combine(self) -> PandasDataframe: """ Create a single partition PandasDataframe from the partitions of the current dataframe. 
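Most of the annotation churn in these hunks just drops the quotes from
`"PandasDataframe"`; this is safe because the module now starts with
`from __future__ import annotations` (PEP 563), under which annotations are
stored as strings and never evaluated at definition time. A minimal
self-contained illustration with a hypothetical `Node` class:

    from __future__ import annotations

    class Node:
        def clone(self) -> Node:  # no quotes needed for the self-reference
            return Node()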
@@ -2893,7 +2899,7 @@ def apply_full_axis( new_columns : list-like, optional The columns of the result. We may know this in advance, and if not provided it must be computed. - apply_indices : list-like, default: None + apply_indices : list-like, optional Indices of `axis ^ 1` to apply function over. enumerate_partitions : bool, default: False Whether pass partition index into applied `func` or not. @@ -2963,9 +2969,9 @@ def apply_full_axis_select_indices( The axis to apply over. func : callable The function to apply. - apply_indices : list-like, default: None + apply_indices : list-like, optional The labels to apply over. - numeric_indices : list-like, default: None + numeric_indices : list-like, optional The indices to apply over. new_index : list-like, optional The index of the result. We may know this in advance, @@ -3034,12 +3040,12 @@ def apply_select_indices( The axis to apply over. func : callable The function to apply. - apply_indices : list-like, default: None + apply_indices : list-like, optional The labels to apply over. Must be given if axis is provided. - row_labels : list-like, default: None + row_labels : list-like, optional The row labels to apply over. Must be provided with `col_labels` to apply over both axes. - col_labels : list-like, default: None + col_labels : list-like, optional The column labels to apply over. Must be provided with `row_labels` to apply over both axes. new_index : list-like, optional @@ -3166,7 +3172,7 @@ def broadcast_apply( labels : {"keep", "replace", "drop"}, default: "keep" Whether keep labels from `self` Modin DataFrame, replace them with labels from joined DataFrame or drop altogether to make them be computed lazily later. - dtypes : "copy", pandas.Series or None, default: None + dtypes : "copy", pandas.Series or None, optional Dtypes of the result. "copy" to keep old dtypes and None to compute them on demand. Returns @@ -3318,9 +3324,9 @@ def broadcast_apply_select_indices( Function to apply. other : PandasDataframe Partitions of which should be broadcasted. - apply_indices : list, default: None + apply_indices : list, optional List of labels to apply (if `numeric_indices` are not specified). - numeric_indices : list, default: None + numeric_indices : list, optional Numeric indices to apply (if `apply_indices` are not specified). keep_remaining : bool, default: False Whether drop the data that is not computed over or not. @@ -3488,6 +3494,8 @@ def broadcast_apply_full_axis( kw["dtypes"] = dtypes.copy() else: if new_columns is None: + assert not is_list_like(dtypes) + # need something like this utility: construct_dtype() kw["dtypes"] = ModinDtypes( DtypesDescriptor( # TODO: pyarrow backend @@ -3572,7 +3580,7 @@ def broadcast_apply_full_axis( result.synchronize_labels(axis=1) return result - def _check_if_axes_identical(self, other: "PandasDataframe", axis: int = 0) -> bool: + def _check_if_axes_identical(self, other: PandasDataframe, axis: int = 0) -> bool: """ Check whether indices/partitioning along the specified `axis` are identical when compared with `other`. @@ -3626,7 +3634,7 @@ def _copartition( this method will skip repartitioning if it is possible. This is because reindexing is extremely inefficient. Because this method is used to `join` or `append`, it is vital that the internal indices match. - fill_value : any, default: None + fill_value : any, optional Value to use for missing values. 
Returns @@ -3748,7 +3756,7 @@ def n_ary_op( join_type="outer", copartition_along_columns=True, labels="replace", - dtypes=None, + dtypes: Optional[pandas.Series] = None, ): """ Perform an n-opary operation by joining with other Modin DataFrame(s). @@ -3767,7 +3775,7 @@ def n_ary_op( labels : {"replace", "drop"}, default: "replace" Whether use labels from joined DataFrame or drop altogether to make them be computed lazily later. - dtypes : series, default: None + dtypes : pandas.Series, optional Dtypes of the resultant dataframe, this argument will be received if the resultant dtypes of n-opary operation is precomputed. @@ -3837,10 +3845,10 @@ def n_ary_op( def concat( self, axis: Union[int, Axis], - others: Union["PandasDataframe", List["PandasDataframe"]], + others: Union[PandasDataframe, List[PandasDataframe]], how, sort, - ) -> "PandasDataframe": + ) -> PandasDataframe: """ Concatenate `self` with one or more other Modin DataFrames. @@ -3918,6 +3926,7 @@ def _compute_new_widths(): new_index = self.index.append([other.index for other in others]) new_columns = joined_index frames = [self] + others + # TODO: should we wrap all `concat` call into "try except" block? new_dtypes = ModinDtypes.concat([frame._dtypes for frame in frames], axis=1) # If we have already cached the length of each row in at least one # of the row's partitions, we can build new_lengths for the new @@ -3961,7 +3970,13 @@ def _compute_new_widths(): ) def _apply_func_to_range_partitioning_broadcast( - self, right, func, key, new_index=None, new_columns=None, new_dtypes=None + self, + right, + func, + key, + new_index=None, + new_columns=None, + new_dtypes: Optional[Union[ModinDtypes, pandas.Series]] = None, ): """ Apply `func` against two dataframes using range-partitioning implementation. @@ -4033,7 +4048,7 @@ def groupby( self, axis: Union[int, Axis], internal_by: List[str], - external_by: List["PandasDataframe"], + external_by: List[PandasDataframe], by_positions: List[int], operator: Callable, result_schema: Optional[Dict[Hashable, type]] = None, @@ -4041,7 +4056,7 @@ def groupby( series_groupby: bool = False, add_missing_cats: bool = False, **kwargs: dict, - ) -> "PandasDataframe": + ) -> PandasDataframe: """ Generate groups based on values in the input column(s) and perform the specified operation on each. @@ -4424,7 +4439,7 @@ def groupby_reduce( new_columns : pandas.Index, optional Columns of the result. We may know this in advance, and if not provided it must be computed. - apply_indices : list-like, default: None + apply_indices : list-like, optional Indices of `axis ^ 1` to apply groupby over. Returns @@ -4534,6 +4549,7 @@ def _arrow_type_to_dtype(cls, arrow_type): import pyarrow try: + # TODO: should we map arrow types to pyarrow-backed pandas types? res = arrow_type.to_pandas_dtype() # Conversion to pandas is not implemented for some arrow types, # perform manual conversion for them: @@ -4692,7 +4708,7 @@ def __dataframe__(self, nan_as_null: bool = False, allow_copy: bool = True): ) @classmethod - def from_dataframe(cls, df: "ProtocolDataframe") -> "PandasDataframe": + def from_dataframe(cls, df: "ProtocolDataframe") -> PandasDataframe: """ Convert a DataFrame implementing the dataframe exchange protocol to a Core Modin Dataframe. 
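On the `_arrow_type_to_dtype` question above (should arrow types map to
pyarrow-backed pandas types?), a sketch of the two conversion paths; it
assumes pyarrow is installed and uses pyarrow's stock `types_mapper=` hook:

    import pandas
    import pyarrow as pa

    at = pa.table({"a": [1, 2, None], "b": ["x", "y", "z"]})

    # default mapping: the nullable int column falls back to float64,
    # strings become object
    df_numpy = at.to_pandas()

    # pyarrow-backed mapping: int64[pyarrow] and string[pyarrow] ArrowDtypes
    df_arrow = at.to_pandas(types_mapper=pandas.ArrowDtype)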
diff --git a/modin/core/dataframe/pandas/metadata/dtypes.py b/modin/core/dataframe/pandas/metadata/dtypes.py index ec227cde32f..c7979704db2 100644 --- a/modin/core/dataframe/pandas/metadata/dtypes.py +++ b/modin/core/dataframe/pandas/metadata/dtypes.py @@ -19,7 +19,7 @@ import numpy as np import pandas -from pandas._typing import IndexLabel +from pandas._typing import DtypeObj, IndexLabel from pandas.core.dtypes.cast import find_common_type if TYPE_CHECKING: @@ -35,13 +35,13 @@ class DtypesDescriptor: Parameters ---------- - known_dtypes : dict[IndexLabel, np.dtype] or pandas.Series, optional + known_dtypes : dict[IndexLabel, DtypeObj] or pandas.Series, optional Columns that we know dtypes for. cols_with_unknown_dtypes : list[IndexLabel], optional Column names that have unknown dtypes. If specified together with `remaining_dtype`, must describe all columns with unknown dtypes, otherwise, the missing columns will be assigned to `remaining_dtype`. If `cols_with_unknown_dtypes` is incomplete, you must specify `know_all_names=False`. - remaining_dtype : np.dtype, optional + remaining_dtype : DtypeObj, optional Dtype for columns that are not present neither in `known_dtypes` nor in `cols_with_unknown_dtypes`. This parameter is intended to describe columns that we known dtypes for, but don't know their names yet. Note, that this parameter DOESN'T describe dtypes for columns from `cols_with_unknown_dtypes`. @@ -62,11 +62,10 @@ class DtypesDescriptor: def __init__( self, - known_dtypes: Optional[Union[dict[IndexLabel, np.dtype], pandas.Series]] = None, + known_dtypes: Optional[Union[dict[IndexLabel, DtypeObj], pandas.Series]] = None, cols_with_unknown_dtypes: Optional[list[IndexLabel]] = None, - # TODO: what if there is a type of another backend - remaining_dtype: Optional[np.dtype] = None, - parent_df: Optional["PandasDataframe"] = None, + remaining_dtype: Optional[DtypeObj] = None, + parent_df: Optional[PandasDataframe] = None, columns_order: Optional[dict[int, IndexLabel]] = None, know_all_names: bool = True, _schema_is_known: Optional[bool] = None, @@ -76,7 +75,7 @@ def __init__( "It's not allowed to pass 'remaining_dtype' and 'know_all_names=False' at the same time." ) # columns with known dtypes - self._known_dtypes: dict[IndexLabel, np.dtype] = ( + self._known_dtypes: dict[IndexLabel, DtypeObj] = ( {} if known_dtypes is None else dict(known_dtypes) ) if known_dtypes is not None and len(self._known_dtypes) != len(known_dtypes): @@ -109,7 +108,7 @@ def __init__( self._know_all_names: bool = know_all_names # a common dtype for columns that are not present in 'known_dtypes' nor in 'cols_with_unknown_dtypes' - self._remaining_dtype: Optional[np.dtype] = remaining_dtype + self._remaining_dtype: Optional[DtypeObj] = remaining_dtype self._parent_df: Optional["PandasDataframe"] = parent_df if columns_order is None: self._columns_order: Optional[dict[int, IndexLabel]] = None @@ -135,7 +134,7 @@ def __init__( ) self._columns_order: Optional[dict[int, IndexLabel]] = columns_order - def update_parent(self, new_parent: "PandasDataframe"): + def update_parent(self, new_parent: PandasDataframe): """ Set new parent dataframe. @@ -205,7 +204,7 @@ def __str__(self): # noqa: GL08 def lazy_get( self, ids: list[Union[IndexLabel, int]], numeric_index: bool = False - ) -> "DtypesDescriptor": + ) -> DtypesDescriptor: """ Get dtypes descriptor for a subset of columns without triggering any computations. 
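For reference, an illustrative construction of the descriptor with the
parameters documented above (column names and dtypes are made up): "a" has a
known dtype, "b" is pending computation, and any columns not yet named share
`remaining_dtype`:

    import pandas

    from modin.core.dataframe.pandas.metadata.dtypes import DtypesDescriptor

    desc = DtypesDescriptor(
        known_dtypes={"a": pandas.api.types.pandas_dtype("int64")},
        cols_with_unknown_dtypes=["b"],
        remaining_dtype=pandas.api.types.pandas_dtype("float64"),
    )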
@@ -258,7 +257,7 @@ def lazy_get( columns_order=columns_order, ) - def copy(self) -> "DtypesDescriptor": + def copy(self) -> DtypesDescriptor: """ Get a copy of this descriptor. @@ -280,9 +279,7 @@ def copy(self) -> "DtypesDescriptor": _schema_is_known=self._schema_is_known, ) - def set_index( - self, new_index: Union[pandas.Index, "ModinIndex"] - ) -> "DtypesDescriptor": + def set_index(self, new_index: Union[pandas.Index, ModinIndex]) -> DtypesDescriptor: """ Set new column names for this descriptor. @@ -327,7 +324,7 @@ def set_index( } return new_self - def equals(self, other: "DtypesDescriptor") -> bool: + def equals(self, other: DtypesDescriptor) -> bool: """ Compare two descriptors for equality. @@ -444,25 +441,25 @@ def to_series(self) -> pandas.Series: self.materialize() return pandas.Series(self._known_dtypes) - def get_dtypes_set(self) -> set[np.dtype]: + def get_dtypes_set(self) -> set[DtypeObj]: """ Get a set of dtypes from the descriptor. Returns ------- - set[np.dtype] + set[DtypeObj] """ if len(self._cols_with_unknown_dtypes) > 0 or not self._know_all_names: self._materialize_cols_with_unknown_dtypes() - known_dtypes: set[np.dtype] = set(self._known_dtypes.values()) + known_dtypes: set[DtypeObj] = set(self._known_dtypes.values()) if self._remaining_dtype is not None: known_dtypes.add(self._remaining_dtype) return known_dtypes @classmethod def _merge_dtypes( - cls, values: list[Union["DtypesDescriptor", pandas.Series, None]] - ) -> "DtypesDescriptor": + cls, values: list[Union[DtypesDescriptor, pandas.Series, None]] + ) -> DtypesDescriptor: """ Union columns described by ``values`` and compute common dtypes for them. @@ -555,8 +552,8 @@ def combine_dtypes(row): @classmethod def concat( - cls, values: list[Union["DtypesDescriptor", pandas.Series, None]], axis: int = 0 - ) -> "DtypesDescriptor": + cls, values: list[Union[DtypesDescriptor, pandas.Series, None]], axis: int = 0 + ) -> DtypesDescriptor: """ Concatenate dtypes descriptors into a single descriptor. @@ -780,14 +777,13 @@ def is_materialized(self) -> bool: """ return isinstance(self._value, pandas.Series) - # TODO: pyarrow backend - def get_dtypes_set(self) -> set[np.dtype]: + def get_dtypes_set(self) -> set[DtypeObj]: """ Get a set of dtypes from the descriptor. Returns ------- - set[np.dtype] + set[DtypeObj] """ if isinstance(self._value, DtypesDescriptor): return self._value.get_dtypes_set() @@ -1201,7 +1197,7 @@ def _materialize_categories(self): def get_categories_dtype( cdt: Union[LazyProxyCategoricalDtype, pandas.CategoricalDtype] -): +) -> DtypeObj: """ Get the categories dtype. @@ -1220,7 +1216,7 @@ def get_categories_dtype( ) -def extract_dtype(value): +def extract_dtype(value) -> Union[DtypeObj, pandas.Series]: """ Extract dtype(s) from the passed `value`. @@ -1230,7 +1226,7 @@ def extract_dtype(value): Returns ------- - numpy.dtype or pandas.Series of numpy.dtypes + DtypeObj or pandas.Series of DtypeObj """ from modin.pandas.utils import is_scalar diff --git a/modin/core/storage_formats/pandas/aggregations.py b/modin/core/storage_formats/pandas/aggregations.py index 454b75c442b..7d5293b1017 100644 --- a/modin/core/storage_formats/pandas/aggregations.py +++ b/modin/core/storage_formats/pandas/aggregations.py @@ -65,6 +65,7 @@ def corr_method( qc._modin_frame.copy_columns_cache(), ) new_dtypes = pandas.Series( + # TODO: pyarrow backend? 
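+            # (presumably pandas.api.types.pandas_dtype("float64[pyarrow]")
+            # for ArrowDtype frames; an assumption, not decided by this patch)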
np.repeat(pandas.api.types.pandas_dtype("float"), len(new_columns)), index=new_columns, ) @@ -74,6 +75,7 @@ def corr_method( new_columns = old_dtypes[old_dtypes.map(is_numeric_dtype)].index new_index = new_columns.copy() new_dtypes = pandas.Series( + # TODO: pyarrow backend? np.repeat(pandas.api.types.pandas_dtype("float"), len(new_columns)), index=new_columns, ) diff --git a/modin/pandas/dataframe.py b/modin/pandas/dataframe.py index 320ddad7f4e..df0c2b9f436 100644 --- a/modin/pandas/dataframe.py +++ b/modin/pandas/dataframe.py @@ -1627,6 +1627,7 @@ def prod( return Series( [np.nan] * len(new_index), index=new_index, + # TODO: pyarrow backend? dtype=pandas.api.types.pandas_dtype("object"), ) @@ -2152,6 +2153,7 @@ def sum( return Series( [np.nan] * len(new_index), index=new_index, + # TODO: pyarrow backend? dtype=pandas.api.types.pandas_dtype("object"), ) @@ -3061,6 +3063,7 @@ def _validate_dtypes_min_max(self, axis, numeric_only) -> DataFrame: ): # check if there are columns with dtypes datetime or timedelta if all( + # TODO: pyarrow backend? dtype != pandas.api.types.pandas_dtype("datetime64[ns]") and dtype != pandas.api.types.pandas_dtype("timedelta64[ns]") for dtype in self.dtypes @@ -3097,6 +3100,7 @@ def _validate_dtypes_sum_prod_mean( not axis and numeric_only is False and any( + # TODO: pyarrow backend? dtype == pandas.api.types.pandas_dtype("datetime64[ns]") for dtype in self.dtypes ) @@ -3117,6 +3121,7 @@ def _validate_dtypes_sum_prod_mean( ): # check if there are columns with dtypes datetime or timedelta if all( + # TODO: pyarrow backend? dtype != pandas.api.types.pandas_dtype("datetime64[ns]") and dtype != pandas.api.types.pandas_dtype("timedelta64[ns]") for dtype in self.dtypes diff --git a/modin/tests/pandas/dataframe/test_reduce.py b/modin/tests/pandas/dataframe/test_reduce.py index eba5b9341af..dcbdc39e17b 100644 --- a/modin/tests/pandas/dataframe/test_reduce.py +++ b/modin/tests/pandas/dataframe/test_reduce.py @@ -327,6 +327,7 @@ def test_sum(data, axis, skipna, is_transposed, request): @pytest.mark.parametrize("dtype", ["int64", "Int64"]) def test_dtype_consistency(dtype): # test for issue #6781 + # TODO: add pyarrow dtype res_dtype = pd.DataFrame([1, 2, 3, 4], dtype=dtype).sum().dtype assert res_dtype == pandas.api.types.pandas_dtype(dtype) From e24201fc31a4dd6d8fc2981f32557720149dda39 Mon Sep 17 00:00:00 2001 From: Anatoly Myachev Date: Fri, 3 May 2024 20:49:10 +0200 Subject: [PATCH 18/50] add construct_dtype Signed-off-by: Anatoly Myachev --- .../dataframe/pandas/dataframe/dataframe.py | 50 +++++++++++-------- .../pandas/partitioning/partition_manager.py | 8 ++- 2 files changed, 34 insertions(+), 24 deletions(-) diff --git a/modin/core/dataframe/pandas/dataframe/dataframe.py b/modin/core/dataframe/pandas/dataframe/dataframe.py index 78d297b11b0..c0f8dfa28d1 100644 --- a/modin/core/dataframe/pandas/dataframe/dataframe.py +++ b/modin/core/dataframe/pandas/dataframe/dataframe.py @@ -101,11 +101,13 @@ class PandasDataframe( _partition_mgr_cls: PandasDataframePartitionManager = None _query_compiler_cls = PandasQueryCompiler # These properties flag whether or not we are deferring the metadata synchronization - _deferred_index = False - _deferred_column = False + _deferred_index: bool = False + _deferred_column: bool = False + _index_cache: ModinIndex = None _columns_cache: ModinIndex = None _dtypes: Optional[ModinDtypes] = None + _pandas_backend: str = None @pandas.util.cache_readonly def __constructor__(self) -> Callable[..., PandasDataframe]: @@ -126,6 +128,7 @@ def 
__init__( row_lengths=None, column_widths=None, dtypes: Optional[Union[pandas.Series, ModinDtypes, Callable]] = None, + pandas_backend: Optional[str] = None, ): self._partitions = partitions self.set_index_cache(index) @@ -133,6 +136,7 @@ def __init__( self._row_lengths_cache = row_lengths self._column_widths_cache = column_widths self.set_dtypes_cache(dtypes) + self._pandas_backend = pandas_backend self._validate_axes_lengths() self._filter_empties(compute_metadata=False) @@ -1668,7 +1672,7 @@ def astype(self, col_dtypes, errors: str = "raise"): if new_dtypes is None: new_dtypes = self_dtypes.copy() # Update the new dtype series to the proper pandas dtype - # TODO: pyarrow backend? + # TODO: pyarrow backend? We don't need to add an implicit backend for `astype` new_dtype = pandas.api.types.pandas_dtype(dtype) if Engine.get() == "Dask" and hasattr(dtype, "_is_materialized"): # FIXME: https://github.com/dask/distributed/issues/8585 @@ -1698,8 +1702,7 @@ def astype_builder(df): # Assume that the dtype is a scalar. if not (col_dtypes == self_dtypes).all(): new_dtypes = self_dtypes.copy() - # TODO: pyarrow backend? - new_dtype = pandas.api.types.pandas_dtype(col_dtypes) + new_dtype = self.construct_dtype(col_dtypes) if Engine.get() == "Dask" and hasattr(new_dtype, "_is_materialized"): # FIXME: https://github.com/dask/distributed/issues/8585 _ = new_dtype._materialize_categories() @@ -2089,9 +2092,8 @@ def _compute_tree_reduce_metadata(self, axis, new_parts, dtypes=None): if dtypes == "copy": dtypes = self.copy_dtypes_cache() elif dtypes is not None: - # TODO: pyarrow backend? dtypes = pandas.Series( - [pandas.api.types.pandas_dtype(dtypes)] * len(new_axes[1]), + [self.construct_dtype(dtypes, self._pandas_backend)] * len(new_axes[1]), index=new_axes[1], ) @@ -2239,9 +2241,8 @@ def map( if isinstance(new_columns, ModinIndex): # Materializing lazy columns in order to build dtype's index new_columns = new_columns.get(return_lengths=False) - # TODO: consider backend dtypes = pandas.Series( - [pandas.api.types.pandas_dtype(dtypes)] * len(new_columns), + [self.construct_dtype(dtypes, self._pandas_backend)] * len(new_columns), index=new_columns, ) return self.__constructor__( @@ -3382,6 +3383,14 @@ def broadcast_apply_select_indices( new_partitions, index=new_index, columns=new_columns ) + def construct_dtype(dtype: str, backend: Optional[str]): + if backend is None: + return pandas.api.types.pandas_dtype(dtype) + elif backend == "pyarrow": + return pandas.api.types.pandas_dtype(f"{dtype}[{backend}]") + else: + raise NotImplementedError + @lazy_metadata_decorator(apply_axis="both") def broadcast_apply_full_axis( self, @@ -3495,20 +3504,15 @@ def broadcast_apply_full_axis( else: if new_columns is None: assert not is_list_like(dtypes) - # need something like this utility: construct_dtype() - kw["dtypes"] = ModinDtypes( - DtypesDescriptor( - # TODO: pyarrow backend - remaining_dtype=pandas.api.types.pandas_dtype(dtypes) - ) - ) + dtype = self.construct_dtype(dtypes, self._pandas_backend) + kw["dtypes"] = ModinDtypes(DtypesDescriptor(remaining_dtype=dtype)) else: kw["dtypes"] = ( pandas.Series(dtypes, index=new_columns) if is_list_like(dtypes) else pandas.Series( - # TODO: pyarrow backend - [pandas.api.types.pandas_dtype(dtypes)] * len(new_columns), + [self.construct_dtype(dtypes, self._pandas_backend)] + * len(new_columns), index=new_columns, ) ) @@ -4486,8 +4490,8 @@ def from_pandas(cls, df): new_index = df.index new_columns = df.columns new_dtypes = df.dtypes - new_frame, new_lengths, new_widths = 
cls._partition_mgr_cls.from_pandas( - df, True + new_frame, new_lengths, new_widths, backend = ( + cls._partition_mgr_cls.from_pandas(df, True) ) return cls( new_frame, @@ -4496,6 +4500,7 @@ def from_pandas(cls, df): new_lengths, new_widths, dtypes=new_dtypes, + backend=backend, ) @classmethod @@ -4513,7 +4518,7 @@ def from_arrow(cls, at): PandasDataframe New Modin DataFrame. """ - new_frame, new_lengths, new_widths = cls._partition_mgr_cls.from_arrow( + new_frame, new_lengths, new_widths, backend = cls._partition_mgr_cls.from_arrow( at, return_dims=True ) new_columns = Index.__new__(Index, data=at.column_names, dtype="O") @@ -4529,6 +4534,7 @@ def from_arrow(cls, at): row_lengths=new_lengths, column_widths=new_widths, dtypes=new_dtypes, + backend=backend, ) @classmethod @@ -4708,7 +4714,7 @@ def __dataframe__(self, nan_as_null: bool = False, allow_copy: bool = True): ) @classmethod - def from_dataframe(cls, df: "ProtocolDataframe") -> PandasDataframe: + def from_dataframe(cls, df: ProtocolDataframe) -> PandasDataframe: """ Convert a DataFrame implementing the dataframe exchange protocol to a Core Modin Dataframe. diff --git a/modin/core/dataframe/pandas/partitioning/partition_manager.py b/modin/core/dataframe/pandas/partitioning/partition_manager.py index 0da5303fba9..3a6ad333df6 100644 --- a/modin/core/dataframe/pandas/partitioning/partition_manager.py +++ b/modin/core/dataframe/pandas/partitioning/partition_manager.py @@ -928,10 +928,13 @@ def update_bar(f): parts = cls.split_pandas_df_into_partitions( df, row_chunksize, col_chunksize, update_bar ) + backend = None + if any(isinstance(x, pandas.ArrowDtype) for x in df.dtypes): + backend = "pyarrow" if ProgressBar.get(): pbar.close() if not return_dims: - return parts + return parts, backend else: row_lengths = [ ( @@ -949,7 +952,7 @@ def update_bar(f): ) for i in range(0, len(df.columns), col_chunksize) ] - return parts, row_lengths, col_widths + return parts, row_lengths, col_widths, backend @classmethod def from_arrow(cls, at, return_dims=False): @@ -969,6 +972,7 @@ def from_arrow(cls, at, return_dims=False): np.ndarray or (np.ndarray, row_lengths, col_widths) A NumPy array with partitions (with dimensions or not). """ + # also return backend return cls.from_pandas(at.to_pandas(), return_dims=return_dims) @classmethod From 4dba613a0f77a62f7ffad3604e555704d81a265b Mon Sep 17 00:00:00 2001 From: Anatoly Myachev Date: Mon, 6 May 2024 12:30:17 +0200 Subject: [PATCH 19/50] fix Signed-off-by: Anatoly Myachev --- modin/pandas/dataframe.py | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/modin/pandas/dataframe.py b/modin/pandas/dataframe.py index df0c2b9f436..c56a69726a9 100644 --- a/modin/pandas/dataframe.py +++ b/modin/pandas/dataframe.py @@ -1624,11 +1624,14 @@ def prod( and min_count > len(axis_to_apply) ): new_index = self.columns if not axis else self.index + # >>> pd.DataFrame([1,2,3,4], dtype="int64[pyarrow]").prod(min_count=10) + # 0 + # dtype: int64[pyarrow] return Series( [np.nan] * len(new_index), index=new_index, # TODO: pyarrow backend? - dtype=pandas.api.types.pandas_dtype("object"), + dtype=pandas.api.types.pandas_dtype("float64"), ) data = self._validate_dtypes_sum_prod_mean(axis, numeric_only, ignore_axis=True) @@ -2154,7 +2157,7 @@ def sum( [np.nan] * len(new_index), index=new_index, # TODO: pyarrow backend? 
- dtype=pandas.api.types.pandas_dtype("object"), + dtype=pandas.api.types.pandas_dtype("float64"), ) data = self._validate_dtypes_sum_prod_mean( From ea05389c797a0da2d30c7ec38bc464c145da308e Mon Sep 17 00:00:00 2001 From: Anatoly Myachev Date: Mon, 6 May 2024 14:01:19 +0200 Subject: [PATCH 20/50] fix Signed-off-by: Anatoly Myachev --- modin/core/dataframe/pandas/dataframe/dataframe.py | 2 +- modin/core/storage_formats/base/query_compiler.py | 14 +++++++++++++- modin/pandas/dataframe.py | 6 ++++-- 3 files changed, 18 insertions(+), 4 deletions(-) diff --git a/modin/core/dataframe/pandas/dataframe/dataframe.py b/modin/core/dataframe/pandas/dataframe/dataframe.py index f8e7fb7d91b..0d2e9170eeb 100644 --- a/modin/core/dataframe/pandas/dataframe/dataframe.py +++ b/modin/core/dataframe/pandas/dataframe/dataframe.py @@ -3418,7 +3418,7 @@ def broadcast_apply_select_indices( new_partitions, index=new_index, columns=new_columns ) - def construct_dtype(dtype: str, backend: Optional[str]): + def construct_dtype(self, dtype: str, backend: Optional[str]): if backend is None: return pandas.api.types.pandas_dtype(dtype) elif backend == "pyarrow": diff --git a/modin/core/storage_formats/base/query_compiler.py b/modin/core/storage_formats/base/query_compiler.py index c6e09d00dfb..dbf964ee24a 100644 --- a/modin/core/storage_formats/base/query_compiler.py +++ b/modin/core/storage_formats/base/query_compiler.py @@ -21,7 +21,7 @@ import abc import warnings -from typing import Callable, Hashable, List, Optional +from typing import TYPE_CHECKING, Callable, Hashable, List, Optional import numpy as np import pandas @@ -52,6 +52,10 @@ from . import doc_utils +if TYPE_CHECKING: + # TODO: should be ModinDataframe + from modin.core.dataframe.pandas.dataframe.dataframe import PandasDataframe + def _get_axis(axis): """ @@ -126,6 +130,8 @@ class BaseQueryCompiler( for a list of requirements for subclassing this object. """ + _modin_frame: PandasDataframe + def __wrap_in_qc(self, obj): """ Wrap `obj` in query compiler. @@ -6747,6 +6753,12 @@ def case_when(self, caselist): # noqa: PR01, RT01, D200 ] return SeriesDefault.register(pandas.Series.case_when)(self, caselist=caselist) + def construct_dtype(self, dtype: str, backend: Optional[str]): + return self._modin_frame.construct_dtype(dtype, backend) + + def get_backend(self) -> str: + return self._modin_frame._pandas_backend + def repartition(self, axis=None): """ Repartitioning QueryCompiler objects to get ideal partitions inside. diff --git a/modin/pandas/dataframe.py b/modin/pandas/dataframe.py index c56a69726a9..5b0ea87053a 100644 --- a/modin/pandas/dataframe.py +++ b/modin/pandas/dataframe.py @@ -1622,6 +1622,8 @@ def prod( skipna is not False and numeric_only is False and min_count > len(axis_to_apply) + # Type inference is not so simple for pyarrow + and self._query_compiler.get_backend() == "default" ): new_index = self.columns if not axis else self.index # >>> pd.DataFrame([1,2,3,4], dtype="int64[pyarrow]").prod(min_count=10) @@ -1630,7 +1632,6 @@ def prod( return Series( [np.nan] * len(new_index), index=new_index, - # TODO: pyarrow backend? dtype=pandas.api.types.pandas_dtype("float64"), ) @@ -2151,12 +2152,13 @@ def sum( skipna is not False and numeric_only is False and min_count > len(axis_to_apply) + # Type inference is not so simple for pyarrow + and self._query_compiler.get_backend() == "default" ): new_index = self.columns if not axis else self.index return Series( [np.nan] * len(new_index), index=new_index, - # TODO: pyarrow backend? 
dtype=pandas.api.types.pandas_dtype("float64"), ) From 005f4802c3520e8c1be6c5c5e95a048aa186ec1a Mon Sep 17 00:00:00 2001 From: Anatoly Myachev Date: Mon, 6 May 2024 16:02:00 +0200 Subject: [PATCH 21/50] fix Signed-off-by: Anatoly Myachev --- modin/core/dataframe/pandas/dataframe/dataframe.py | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/modin/core/dataframe/pandas/dataframe/dataframe.py b/modin/core/dataframe/pandas/dataframe/dataframe.py index 0d2e9170eeb..6a32741b3cb 100644 --- a/modin/core/dataframe/pandas/dataframe/dataframe.py +++ b/modin/core/dataframe/pandas/dataframe/dataframe.py @@ -1702,7 +1702,7 @@ def astype_builder(df): # Assume that the dtype is a scalar. if not (col_dtypes == self_dtypes).all(): new_dtypes = self_dtypes.copy() - new_dtype = self.construct_dtype(col_dtypes) + new_dtype = self.construct_dtype(col_dtypes, self._pandas_backend) if Engine.get() == "Dask" and hasattr(new_dtype, "_is_materialized"): # FIXME: https://github.com/dask/distributed/issues/8585 _ = new_dtype._materialize_categories() @@ -4525,7 +4525,7 @@ def from_pandas(cls, df): new_index = df.index new_columns = df.columns new_dtypes = df.dtypes - new_frame, new_lengths, new_widths, backend = ( + new_frame, new_lengths, new_widths, pandas_backend = ( cls._partition_mgr_cls.from_pandas(df, True) ) return cls( @@ -4535,7 +4535,7 @@ def from_pandas(cls, df): new_lengths, new_widths, dtypes=new_dtypes, - backend=backend, + pandas_backend=pandas_backend, ) @classmethod @@ -4553,8 +4553,8 @@ def from_arrow(cls, at): PandasDataframe New Modin DataFrame. """ - new_frame, new_lengths, new_widths, backend = cls._partition_mgr_cls.from_arrow( - at, return_dims=True + new_frame, new_lengths, new_widths, pandas_backend = ( + cls._partition_mgr_cls.from_arrow(at, return_dims=True) ) new_columns = Index.__new__(Index, data=at.column_names, dtype="O") new_index = Index.__new__(RangeIndex, data=range(at.num_rows)) @@ -4569,7 +4569,7 @@ def from_arrow(cls, at): row_lengths=new_lengths, column_widths=new_widths, dtypes=new_dtypes, - backend=backend, + pandas_backend=pandas_backend, ) @classmethod From 0d34bea374010c2465642456757c8fc68be45dc4 Mon Sep 17 00:00:00 2001 From: Anatoly Myachev Date: Mon, 6 May 2024 16:53:24 +0200 Subject: [PATCH 22/50] fix Signed-off-by: Anatoly Myachev --- modin/core/storage_formats/pandas/query_compiler.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/modin/core/storage_formats/pandas/query_compiler.py b/modin/core/storage_formats/pandas/query_compiler.py index b139ad4ff56..866a2456fae 100644 --- a/modin/core/storage_formats/pandas/query_compiler.py +++ b/modin/core/storage_formats/pandas/query_compiler.py @@ -4522,7 +4522,9 @@ def iloc_mut(partition, row_internal_indices, col_internal_indices, item): # to the following warning. Example: "FutureWarning: Setting an item of incompatible # dtype is deprecated and will raise in a future error of pandas. Value '[1.38629436]' # has dtype incompatible with int64, please explicitly cast to a compatible dtype first." 
- partition.iloc[row_internal_indices, col_internal_indices] = item.copy() + partition.iloc[row_internal_indices, col_internal_indices] = ( + item.copy() if hasattr(item, "copy") else item + ) return partition if not is_scalar(item): From aac70970ab06302af0817081078966a0076914bd Mon Sep 17 00:00:00 2001 From: Anatoly Myachev Date: Mon, 6 May 2024 17:02:17 +0200 Subject: [PATCH 23/50] fix Signed-off-by: Anatoly Myachev --- modin/tests/pandas/dataframe/test_binary.py | 2 ++ modin/tests/pandas/dataframe/test_default.py | 2 ++ 2 files changed, 4 insertions(+) diff --git a/modin/tests/pandas/dataframe/test_binary.py b/modin/tests/pandas/dataframe/test_binary.py index 062250bbd8a..a8858cf799a 100644 --- a/modin/tests/pandas/dataframe/test_binary.py +++ b/modin/tests/pandas/dataframe/test_binary.py @@ -465,6 +465,8 @@ def test_non_commutative_multiply(): eval_general(modin_df, pandas_df, lambda s: s * integer) +# TODO: just for developing purpose; remove `xfail` mark +@pytest.mark.xfail @pytest.mark.parametrize( "op", [ diff --git a/modin/tests/pandas/dataframe/test_default.py b/modin/tests/pandas/dataframe/test_default.py index 0f3ca39fd72..173e90e8762 100644 --- a/modin/tests/pandas/dataframe/test_default.py +++ b/modin/tests/pandas/dataframe/test_default.py @@ -528,6 +528,8 @@ def test_info(data, verbose, max_cols, memory_usage, show_counts): assert modin_info[1:] == pandas_info[1:] +# TODO: just for developing purpose; remove `xfail` mark +@pytest.mark.xfail @pytest.mark.parametrize("axis", axis_values, ids=axis_keys) @pytest.mark.parametrize("skipna", [False, True]) @pytest.mark.parametrize("numeric_only", [False, True]) From 258c3b9a70b294bc113fd80b6a44d770853f9086 Mon Sep 17 00:00:00 2001 From: Anatoly Myachev Date: Mon, 6 May 2024 21:10:31 +0200 Subject: [PATCH 24/50] fix Signed-off-by: Anatoly Myachev --- modin/core/dataframe/algebra/fold.py | 13 +++- .../dataframe/pandas/dataframe/dataframe.py | 71 ++++++++++++++++--- .../storage_formats/pandas/aggregations.py | 12 ++-- .../storage_formats/pandas/query_compiler.py | 24 ++++++- .../dataframe/pandas/partitions.py | 4 +- modin/logging/logger_decorator.py | 17 +++-- modin/pandas/general.py | 2 +- modin/tests/pandas/dataframe/test_binary.py | 17 +++-- 8 files changed, 132 insertions(+), 28 deletions(-) diff --git a/modin/core/dataframe/algebra/fold.py b/modin/core/dataframe/algebra/fold.py index 419a0b56903..9f6673a3e0a 100644 --- a/modin/core/dataframe/algebra/fold.py +++ b/modin/core/dataframe/algebra/fold.py @@ -13,14 +13,21 @@ """Module houses builder class for Fold operator.""" +from __future__ import annotations + +from typing import TYPE_CHECKING, Callable + from .operator import Operator +if TYPE_CHECKING: + from modin.core.storage_formats.pandas.query_compiler import PandasQueryCompiler + class Fold(Operator): """Builder class for Fold functions.""" @classmethod - def register(cls, fold_function): + def register(cls, fold_function) -> Callable[..., PandasQueryCompiler]: """ Build Fold operator that will be performed across rows/columns. @@ -35,7 +42,9 @@ def register(cls, fold_function): Function that takes query compiler and executes Fold function. """ - def caller(query_compiler, fold_axis=None, *args, **kwargs): + def caller( + query_compiler: PandasQueryCompiler, fold_axis=None, *args, **kwargs + ) -> PandasQueryCompiler: """ Execute Fold function against passed query compiler. 
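A sketch of what the `Fold.register` annotations above enable (hypothetical
usage; it assumes `Fold` is importable from `modin.core.dataframe.algebra`):

    import pandas

    from modin.core.dataframe.algebra import Fold

    # The registered callable now advertises `PandasQueryCompiler` as its
    # return type, so type checkers can follow query-compiler pipelines
    # instead of treating every registered function as returning `Any`.
    fold_convert_dtypes = Fold.register(pandas.DataFrame.convert_dtypes)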
diff --git a/modin/core/dataframe/pandas/dataframe/dataframe.py b/modin/core/dataframe/pandas/dataframe/dataframe.py index 6a32741b3cb..8f642592856 100644 --- a/modin/core/dataframe/pandas/dataframe/dataframe.py +++ b/modin/core/dataframe/pandas/dataframe/dataframe.py @@ -881,7 +881,7 @@ def synchronize_labels(self, axis=None): else: self._deferred_column = True - def _propagate_index_objs(self, axis=None): + def _propagate_index_objs(self, axis=None) -> None: """ Synchronize labels by applying the index object for specific `axis` to the `self._partitions` lazily. @@ -1320,6 +1320,7 @@ def _take_2d_positional( new_row_lengths, new_col_widths, new_dtypes, + pandas_backend=self._pandas_backend, ) return self._maybe_reorder_labels( @@ -1494,6 +1495,7 @@ def from_labels_executor(df, **kwargs): row_lengths=self._row_lengths_cache, column_widths=new_column_widths, dtypes=new_dtypes, + pandas_backend=self._pandas_backend, ) # Set flag for propagating deferred row labels across dataframe partitions result.synchronize_labels(axis=0) @@ -1620,7 +1622,13 @@ def _reorder_labels(self, row_positions=None, col_positions=None): col_idx = self.copy_columns_cache(copy_lengths=True) new_widths = self._column_widths_cache return self.__constructor__( - ordered_cols, row_idx, col_idx, new_lengths, new_widths, new_dtypes + ordered_cols, + row_idx, + col_idx, + new_lengths, + new_widths, + new_dtypes, + pandas_backend=self._pandas_backend, ) @lazy_metadata_decorator(apply_axis=None) @@ -1640,6 +1648,7 @@ def copy(self): self._row_lengths_cache, self._column_widths_cache, self.copy_dtypes_cache(), + pandas_backend=self._pandas_backend, ) @lazy_metadata_decorator(apply_axis="both") @@ -1742,6 +1751,7 @@ def astype_builder(df): self._row_lengths_cache, self._column_widths_cache, new_dtypes, + pandas_backend=self._pandas_backend, ) def numeric_columns(self, include_bool=True): @@ -2102,6 +2112,7 @@ def _compute_tree_reduce_metadata(self, axis, new_parts, dtypes=None): *new_axes, *new_axes_lengths, dtypes, + pandas_backend=self._pandas_backend, ) return result @@ -2287,6 +2298,7 @@ def map( self._row_lengths_cache, self._column_widths_cache, dtypes=dtypes, + pandas_backend=self._pandas_backend, ) def window( @@ -2366,6 +2378,7 @@ def fold(self, axis, func, new_columns=None): self.copy_columns_cache(copy_lengths=True), self._row_lengths_cache, self._column_widths_cache, + pandas_backend=self._pandas_backend, ) def infer_objects(self) -> PandasDataframe: @@ -2412,6 +2425,7 @@ def infer_types(self, col_labels: List[str]) -> PandasDataframe: self._row_lengths_cache, self._column_widths_cache, new_dtypes, + pandas_backend=self._pandas_backend, ) def join( @@ -2517,6 +2531,7 @@ def combine_and_apply( self._row_lengths_cache, [len(self.columns)] if self.has_materialized_columns else None, self.copy_dtypes_cache(), + pandas_backend=self._pandas_backend, ) else: modin_frame = self @@ -2820,6 +2835,7 @@ def filter(self, axis: Union[Axis, int], condition: Callable) -> PandasDataframe *new_axes, *new_lengths, self.copy_dtypes_cache() if axis == Axis.COL_WISE else None, + pandas_backend=self._pandas_backend, ) def filter_by_types(self, types: List[Hashable]) -> PandasDataframe: @@ -2873,7 +2889,12 @@ def explode(self, axis: Union[int, Axis], func: Callable) -> PandasDataframe: 1, partitions ) return self.__constructor__( - partitions, new_index, new_columns, row_lengths, column_widths + partitions, + new_index, + new_columns, + row_lengths, + column_widths, + pandas_backend=self._pandas_backend, ) def combine(self) -> 
PandasDataframe: @@ -2901,6 +2922,7 @@ def combine(self) -> PandasDataframe: else None ), dtypes=self.copy_dtypes_cache(), + pandas_backend=self._pandas_backend, ) result.synchronize_labels() return result @@ -3050,7 +3072,13 @@ def apply_full_axis_select_indices( if new_columns is None: new_columns = self.columns if axis == 0 else None return self.__constructor__( - new_partitions, new_index, new_columns, None, None, dtypes=new_dtypes + new_partitions, + new_index, + new_columns, + None, + None, + dtypes=new_dtypes, + pandas_backend=self._pandas_backend, ) @lazy_metadata_decorator(apply_axis="both") @@ -3145,6 +3173,7 @@ def apply_select_indices( lengths_objs[0], lengths_objs[1], new_dtypes, + pandas_backend=self._pandas_backend, ) else: # We are applying over both axes here, so make sure we have all the right @@ -3172,6 +3201,7 @@ def apply_select_indices( self._row_lengths_cache, self._column_widths_cache, new_dtypes, + pandas_backend=self._pandas_backend, ) @lazy_metadata_decorator(apply_axis="both") @@ -3277,6 +3307,7 @@ def _pick_axis(get_axis, sizes_cache): new_row_lengths, new_column_widths, dtypes=dtypes, + pandas_backend=self._pandas_backend, ) def _prepare_frame_to_broadcast(self, axis, indices, broadcast_all): @@ -3415,7 +3446,10 @@ def broadcast_apply_select_indices( keep_remaining, ) return self.__constructor__( - new_partitions, index=new_index, columns=new_columns + new_partitions, + index=new_index, + columns=new_columns, + pandas_backend=self._pandas_backend, ) def construct_dtype(self, dtype: str, backend: Optional[str]): @@ -3611,7 +3645,11 @@ def broadcast_apply_full_axis( kw["column_widths"] = self._column_widths_cache result = self.__constructor__( - new_partitions, index=new_index, columns=new_columns, **kw + new_partitions, + index=new_index, + columns=new_columns, + **kw, + pandas_backend=self._pandas_backend, ) if sync_labels and new_index is not None: result.synchronize_labels(axis=0) @@ -3833,6 +3871,7 @@ def n_ary_op( self.copy_columns_cache(copy_lengths=True), row_lengths, self._column_widths_cache, + pandas_backend=self._pandas_backend, ) new_right_frames = [ self.__constructor__( @@ -3841,6 +3880,7 @@ def n_ary_op( right_frame.copy_columns_cache(copy_lengths=True), row_lengths, right_frame._column_widths_cache, + pandas_backend=self._pandas_backend, ) for right_parts, right_frame in zip(list_of_right_parts, right_frames) ] @@ -3878,6 +3918,7 @@ def n_ary_op( row_lengths, column_widths, dtypes, + pandas_backend=self._pandas_backend, ) @lazy_metadata_decorator(apply_axis="both") @@ -4005,7 +4046,13 @@ def _compute_new_widths(): new_widths = None return self.__constructor__( - new_partitions, new_index, new_columns, new_lengths, new_widths, new_dtypes + new_partitions, + new_index, + new_columns, + new_lengths, + new_widths, + new_dtypes, + pandas_backend=self._pandas_backend, ) def _apply_func_to_range_partitioning_broadcast( @@ -4080,6 +4127,7 @@ def _apply_func_to_range_partitioning_broadcast( index=new_index, columns=new_columns, dtypes=new_dtypes, + pandas_backend=self._pandas_backend, ) @lazy_metadata_decorator(apply_axis="both") @@ -4428,6 +4476,7 @@ def join_cols(df, *cols): new_partitions, index=result.copy_index_cache(), row_lengths=result._row_lengths_cache, + pandas_backend=self._pandas_backend, ) if ( @@ -4504,7 +4553,10 @@ def groupby_reduce( axis, self._partitions, by_parts, map_func, reduce_func, apply_indices ) return self.__constructor__( - new_partitions, index=new_index, columns=new_columns + new_partitions, + index=new_index, + 
columns=new_columns, + pandas_backend=self._pandas_backend, ) @classmethod @@ -4689,6 +4741,7 @@ def transpose(self): self._column_widths_cache, self._row_lengths_cache, dtypes=new_dtypes, + pandas_backend=self._pandas_backend, ) @lazy_metadata_decorator(apply_axis="both") @@ -4876,6 +4929,7 @@ def remote_fn(df, name, caselist): # pragma: no cover columns, row_lengths, column_widths, + pandas_backend=self._pandas_backend, ) for part in list_of_right_parts ) @@ -4947,4 +5001,5 @@ def map_data( index=self.index, row_lengths=lengths, column_widths=[1], + pandas_backend=self._pandas_backend, ) diff --git a/modin/core/storage_formats/pandas/aggregations.py b/modin/core/storage_formats/pandas/aggregations.py index 7d5293b1017..6c2e795a523 100644 --- a/modin/core/storage_formats/pandas/aggregations.py +++ b/modin/core/storage_formats/pandas/aggregations.py @@ -13,6 +13,8 @@ """Contains implementations for aggregation functions.""" +from __future__ import annotations + from enum import Enum from typing import TYPE_CHECKING, Callable, Tuple @@ -38,7 +40,7 @@ class Method(Enum): @classmethod def build_corr_method( cls, - ) -> Callable[["PandasQueryCompiler", str, int, bool], "PandasQueryCompiler"]: + ) -> Callable[[PandasQueryCompiler, str, int, bool], PandasQueryCompiler]: """ Build a query compiler method computing the correlation matrix. @@ -49,12 +51,12 @@ def build_corr_method( """ def corr_method( - qc: "PandasQueryCompiler", + qc: PandasQueryCompiler, method: str, min_periods: int = 1, numeric_only: bool = True, - ) -> "PandasQueryCompiler": - if method != "pearson": + ) -> PandasQueryCompiler: + if method != "pearson" or qc._modin_frame._pandas_backend == "pyarrow": return super(type(qc), qc).corr( method=method, min_periods=min_periods, numeric_only=numeric_only ) @@ -103,7 +105,7 @@ def corr_method( @classmethod def build_cov_method( cls, - ) -> Callable[["PandasQueryCompiler", int, int], "PandasQueryCompiler"]: + ) -> Callable[[PandasQueryCompiler, int, int], PandasQueryCompiler]: """ Build a query compiler method computing the covariance matrix. diff --git a/modin/core/storage_formats/pandas/query_compiler.py b/modin/core/storage_formats/pandas/query_compiler.py index 866a2456fae..83d42965a9e 100644 --- a/modin/core/storage_formats/pandas/query_compiler.py +++ b/modin/core/storage_formats/pandas/query_compiler.py @@ -1858,7 +1858,29 @@ def isin_func(df, values): map = Map.register(pandas.DataFrame.map) # Will it work with pyarrow backend? conj = Map.register(lambda df, *args, **kwargs: pandas.DataFrame(np.conj(df))) - convert_dtypes = Fold.register(pandas.DataFrame.convert_dtypes) + + def convert_dtypes( + self, + infer_objects: bool = True, + convert_string: bool = True, + convert_integer: bool = True, + convert_boolean: bool = True, + convert_floating: bool = True, + dtype_backend: str = "numpy_nullable", + ): + result = Fold.register(pandas.DataFrame.convert_dtypes)( + self, + infer_objects=infer_objects, + convert_string=convert_string, + convert_integer=convert_integer, + convert_boolean=convert_boolean, + convert_floating=convert_floating, + dtype_backend=dtype_backend, + ) + if dtype_backend == "pyarrow": + result._modin_frame._pandas_backend = "pyarrow" + return result + invert = Map.register(pandas.DataFrame.__invert__, dtypes="copy") isna = Map.register(pandas.DataFrame.isna, dtypes="bool") # better way to distinguish methods for NumPy API? 
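A minimal sketch of the behavior the `convert_dtypes` override above targets
(hypothetical session; it relies on the `get_backend` accessor introduced in
PATCH 20 of this series and requires pyarrow to be installed):

    import modin.pandas as pd

    df = pd.DataFrame({"a": [1, 2, None]})
    converted = df.convert_dtypes(dtype_backend="pyarrow")
    # Every column now uses pandas.ArrowDtype, and the underlying frame is
    # tagged accordingly so later operations can branch on the backend.
    assert converted._query_compiler.get_backend() == "pyarrow"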
diff --git a/modin/distributed/dataframe/pandas/partitions.py b/modin/distributed/dataframe/pandas/partitions.py index cac3bec93b6..62a05ff81d3 100644 --- a/modin/distributed/dataframe/pandas/partitions.py +++ b/modin/distributed/dataframe/pandas/partitions.py @@ -90,7 +90,7 @@ def unwrap_partitions( f"Only API Layer objects may be passed in here, got {type(api_layer_object)} instead." ) - modin_frame = api_layer_object._query_compiler._modin_frame # type: ignore[attr-defined] + modin_frame = api_layer_object._query_compiler._modin_frame modin_frame._propagate_index_objs(None) if axis is None: @@ -122,7 +122,7 @@ def get_block(partition: PartitionUnionType) -> np.ndarray: ] actual_engine = type( - api_layer_object._query_compiler._modin_frame._partitions[0][0] # type: ignore[attr-defined] + api_layer_object._query_compiler._modin_frame._partitions[0][0] ).__name__ if actual_engine in ( "PandasOnRayDataframePartition", diff --git a/modin/logging/logger_decorator.py b/modin/logging/logger_decorator.py index 301fb02562b..662f7d1de73 100644 --- a/modin/logging/logger_decorator.py +++ b/modin/logging/logger_decorator.py @@ -19,7 +19,7 @@ from functools import wraps from types import FunctionType, MethodType -from typing import Any, Callable, Dict, Optional, Tuple, Type, Union +from typing import Any, Callable, Dict, Optional, Tuple, Type, TypeVar, Union, overload from modin.config import LogMode @@ -28,6 +28,9 @@ _MODIN_LOGGER_NOWRAP = "__modin_logging_nowrap__" +Fn = TypeVar("Fn", bound=Callable) + + def disable_logging(func: Callable) -> Callable: """ Disable logging of one particular function. Useful for decorated classes. @@ -46,11 +49,17 @@ def disable_logging(func: Callable) -> Callable: return func +@overload +def enable_logging(modin_layer: Fn) -> Fn: + # This helps preserve typings when the decorator is used without parentheses + ... + + def enable_logging( - modin_layer: Union[str, Callable, Type] = "PANDAS-API", + modin_layer: Union[str, Fn, Type] = "PANDAS-API", name: Optional[str] = None, log_level: LogLevel = LogLevel.INFO, -) -> Callable: +) -> Callable[[Fn], Fn]: """ Log Decorator used on specific Modin functions or classes. 
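A hedged illustration of why the `@overload` above helps: `enable_logging` is
used both bare and parametrized, and the overload keeps the decorated
function's own signature visible to type checkers in the bare form (names
below are hypothetical):

    from modin.logging import enable_logging

    @enable_logging
    def bare_use(x: int) -> int:  # checker sees (int) -> int, not Any
        return x

    @enable_logging("CUSTOM-LAYER")
    def parametrized_use() -> None: ...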
@@ -76,7 +85,7 @@ def enable_logging( # def func() return enable_logging()(modin_layer) - def decorator(obj: Any) -> Any: + def decorator(obj: Fn) -> Fn: """Decorate function or class to add logs to Modin API function(s).""" if isinstance(obj, type): seen: Dict[Any, Any] = {} diff --git a/modin/pandas/general.py b/modin/pandas/general.py index aeff9986f35..6c79752847d 100644 --- a/modin/pandas/general.py +++ b/modin/pandas/general.py @@ -415,7 +415,7 @@ def value_counts( ) -@_inherit_docstrings(pandas.concat, apilink="pandas.concat") +# @_inherit_docstrings(pandas.concat, apilink="pandas.concat") @enable_logging def concat( objs: "Iterable[DataFrame | Series] | Mapping[Hashable, DataFrame | Series]", diff --git a/modin/tests/pandas/dataframe/test_binary.py b/modin/tests/pandas/dataframe/test_binary.py index a8858cf799a..a1070d892b7 100644 --- a/modin/tests/pandas/dataframe/test_binary.py +++ b/modin/tests/pandas/dataframe/test_binary.py @@ -85,9 +85,16 @@ def test_math_functions(other, axis, op): # lambda == "series_or_list" pytest.xfail(reason="different behavior") - eval_general( - *create_test_dfs(data), lambda df: getattr(df, op)(other(df, axis), axis=axis) - ) + md_df, pd_df = create_test_dfs(data) + if op in ("mod", "rmod") and any("pyarrow" in str(dtype) for dtype in pd_df.dtypes): + with pytest.raises(NotImplementedError): + eval_general( + md_df, pd_df, lambda df: getattr(df, op)(other(df, axis), axis=axis) + ) + else: + eval_general( + md_df, pd_df, lambda df: getattr(df, op)(other(df, axis), axis=axis) + ) @pytest.mark.parametrize("other", [lambda df: 2, lambda df: df]) @@ -465,8 +472,8 @@ def test_non_commutative_multiply(): eval_general(modin_df, pandas_df, lambda s: s * integer) -# TODO: just for developing purpose; remove `xfail` mark -@pytest.mark.xfail +# TODO: just for developing purpose; remove `skip` mark +@pytest.mark.skip @pytest.mark.parametrize( "op", [ From 068f67ddc6c99a62095794cde2085d2120944f0d Mon Sep 17 00:00:00 2001 From: Anatoly Myachev Date: Mon, 6 May 2024 21:26:01 +0200 Subject: [PATCH 25/50] fix Signed-off-by: Anatoly Myachev --- modin/core/dataframe/pandas/dataframe/dataframe.py | 2 +- modin/core/storage_formats/pandas/query_compiler.py | 3 +++ 2 files changed, 4 insertions(+), 1 deletion(-) diff --git a/modin/core/dataframe/pandas/dataframe/dataframe.py b/modin/core/dataframe/pandas/dataframe/dataframe.py index 8f642592856..c33377a08ae 100644 --- a/modin/core/dataframe/pandas/dataframe/dataframe.py +++ b/modin/core/dataframe/pandas/dataframe/dataframe.py @@ -3453,7 +3453,7 @@ def broadcast_apply_select_indices( ) def construct_dtype(self, dtype: str, backend: Optional[str]): - if backend is None: + if backend is None or dtype == "category": return pandas.api.types.pandas_dtype(dtype) elif backend == "pyarrow": return pandas.api.types.pandas_dtype(f"{dtype}[{backend}]") diff --git a/modin/core/storage_formats/pandas/query_compiler.py b/modin/core/storage_formats/pandas/query_compiler.py index 83d42965a9e..33e05a6c491 100644 --- a/modin/core/storage_formats/pandas/query_compiler.py +++ b/modin/core/storage_formats/pandas/query_compiler.py @@ -2272,6 +2272,9 @@ def clip(self, lower, upper, **kwargs): corr = CorrCovBuilder.build_corr_method() def cov(self, min_periods=None, ddof=1): + if self._modin_frame._pandas_backend == "pyarrow": + return super().cov(min_periods=min_periods, ddof=ddof) + # _nancorr use numpy which incompatible with pandas dataframes on pyarrow return self._nancorr(min_periods=min_periods, cov=True, ddof=ddof) def _nancorr(self, 
min_periods=1, cov=False, ddof=1): From 45c1d1ff51a65146bec27937fbae1042b8dc85fd Mon Sep 17 00:00:00 2001 From: Anatoly Myachev Date: Tue, 7 May 2024 10:56:05 +0200 Subject: [PATCH 26/50] fix Signed-off-by: Anatoly Myachev --- modin/core/dataframe/pandas/dataframe/dataframe.py | 5 +++-- modin/tests/pandas/dataframe/test_map_metadata.py | 10 +++++++--- 2 files changed, 10 insertions(+), 5 deletions(-) diff --git a/modin/core/dataframe/pandas/dataframe/dataframe.py b/modin/core/dataframe/pandas/dataframe/dataframe.py index c33377a08ae..604d1db03d4 100644 --- a/modin/core/dataframe/pandas/dataframe/dataframe.py +++ b/modin/core/dataframe/pandas/dataframe/dataframe.py @@ -1681,7 +1681,7 @@ def astype(self, col_dtypes, errors: str = "raise"): if new_dtypes is None: new_dtypes = self_dtypes.copy() # Update the new dtype series to the proper pandas dtype - # TODO: pyarrow backend? We don't need to add an implicit backend for `astype` + # We don't need to add an implicit backend for `astype` new_dtype = pandas.api.types.pandas_dtype(dtype) if Engine.get() == "Dask" and hasattr(dtype, "_is_materialized"): # FIXME: https://github.com/dask/distributed/issues/8585 @@ -1711,7 +1711,8 @@ def astype_builder(df): # Assume that the dtype is a scalar. if not (col_dtypes == self_dtypes).all(): new_dtypes = self_dtypes.copy() - new_dtype = self.construct_dtype(col_dtypes, self._pandas_backend) + # We don't need to add an implicit backend for `astype` + new_dtype = pandas.api.types.pandas_dtype(col_dtypes) if Engine.get() == "Dask" and hasattr(new_dtype, "_is_materialized"): # FIXME: https://github.com/dask/distributed/issues/8585 _ = new_dtype._materialize_categories() diff --git a/modin/tests/pandas/dataframe/test_map_metadata.py b/modin/tests/pandas/dataframe/test_map_metadata.py index ab7a7fa4a31..b4980118922 100644 --- a/modin/tests/pandas/dataframe/test_map_metadata.py +++ b/modin/tests/pandas/dataframe/test_map_metadata.py @@ -1429,6 +1429,9 @@ def comparator(df1, df2): elif idx == 2: # FIXME: https://github.com/modin-project/modin/issues/7080 expected_exception = False + + if any("pyarrow" in str(dtype) for dtype in pandas_df.dtypes): + pytest.xfail(reason="ValueError(2)") eval_insert( modin_df, pandas_df, @@ -1683,12 +1686,13 @@ def test___neg__(request, data): @pytest.mark.parametrize("data", test_data_values, ids=test_data_keys) def test___invert__(data, request): expected_exception = None + md_df, pd_df = create_test_dfs(data) if "float_nan_data" in request.node.callspec.id: # FIXME: https://github.com/modin-project/modin/issues/7081 expected_exception = False - eval_general( - *create_test_dfs(data), lambda df: ~df, expected_exception=expected_exception - ) + if any("pyarrow" in str(dtype) for dtype in pd_df.dtypes): + pytest.xfail(reason="pyarrow.lib.ArrowNotImplementedError") + eval_general(md_df, pd_df, lambda df: ~df, expected_exception=expected_exception) def test___invert___bool(): From b114314f9471539226cde6dd143df05e8a2e1bed Mon Sep 17 00:00:00 2001 From: Anatoly Myachev Date: Tue, 7 May 2024 12:49:41 +0200 Subject: [PATCH 27/50] fix Signed-off-by: Anatoly Myachev --- modin/core/dataframe/pandas/dataframe/dataframe.py | 6 ++++-- .../implementations/hdk_on_native/calcite_serializer.py | 1 - 2 files changed, 4 insertions(+), 3 deletions(-) diff --git a/modin/core/dataframe/pandas/dataframe/dataframe.py b/modin/core/dataframe/pandas/dataframe/dataframe.py index 604d1db03d4..7c8f6e2ef6a 100644 --- a/modin/core/dataframe/pandas/dataframe/dataframe.py +++ 
b/modin/core/dataframe/pandas/dataframe/dataframe.py @@ -21,6 +21,7 @@ from __future__ import annotations import datetime +from functools import cached_property import re from typing import TYPE_CHECKING, Callable, Dict, Hashable, List, Optional, Union @@ -109,8 +110,8 @@ class PandasDataframe( _dtypes: Optional[ModinDtypes] = None _pandas_backend: str = None - @pandas.util.cache_readonly - def __constructor__(self) -> Callable[..., PandasDataframe]: + @cached_property + def __constructor__(self) -> type[PandasDataframe]: """ Create a new instance of this object. @@ -1745,6 +1746,7 @@ def astype_builder(df): new_frame = self._partition_mgr_cls.lazy_map_partitions( self._partitions, astype_builder ) + # TODO: recompute _pandas_backend (it can be changed) return self.__constructor__( new_frame, self.copy_index_cache(copy_lengths=True), diff --git a/modin/experimental/core/execution/native/implementations/hdk_on_native/calcite_serializer.py b/modin/experimental/core/execution/native/implementations/hdk_on_native/calcite_serializer.py index b00e73dc745..7099751dafe 100644 --- a/modin/experimental/core/execution/native/implementations/hdk_on_native/calcite_serializer.py +++ b/modin/experimental/core/execution/native/implementations/hdk_on_native/calcite_serializer.py @@ -67,7 +67,6 @@ class CalciteSerializer: "datetime64": "TIMESTAMP", } - # TODO: Is it necessary to use more general types here (not dependent on NumPy)? _INT_OPTS = { np.int8: ("TINYINT", 3), np.int16: ("SMALLINT", 5), From c597f7ffd5de330b02f11bc91845e7651e80cfa6 Mon Sep 17 00:00:00 2001 From: Anatoly Myachev Date: Wed, 8 May 2024 00:36:38 +0200 Subject: [PATCH 28/50] cleanup Signed-off-by: Anatoly Myachev --- modin/core/dataframe/pandas/dataframe/dataframe.py | 2 +- .../native/implementations/hdk_on_native/dataframe/utils.py | 6 ++++-- modin/pandas/general.py | 2 +- 3 files changed, 6 insertions(+), 4 deletions(-) diff --git a/modin/core/dataframe/pandas/dataframe/dataframe.py b/modin/core/dataframe/pandas/dataframe/dataframe.py index 7c8f6e2ef6a..a103921bc72 100644 --- a/modin/core/dataframe/pandas/dataframe/dataframe.py +++ b/modin/core/dataframe/pandas/dataframe/dataframe.py @@ -21,8 +21,8 @@ from __future__ import annotations import datetime -from functools import cached_property import re +from functools import cached_property from typing import TYPE_CHECKING, Callable, Dict, Hashable, List, Optional, Union import numpy as np diff --git a/modin/experimental/core/execution/native/implementations/hdk_on_native/dataframe/utils.py b/modin/experimental/core/execution/native/implementations/hdk_on_native/dataframe/utils.py index 4f749cf0e3b..f99cc256baa 100644 --- a/modin/experimental/core/execution/native/implementations/hdk_on_native/dataframe/utils.py +++ b/modin/experimental/core/execution/native/implementations/hdk_on_native/dataframe/utils.py @@ -23,7 +23,7 @@ import pandas import pyarrow as pa from pandas.core.arrays.arrow.extension_types import ArrowIntervalType -from pandas.core.dtypes.common import _get_dtype +from pandas.core.dtypes.common import _get_dtype, is_string_dtype from pyarrow.types import is_dictionary from modin.pandas.indexing import is_range_like @@ -504,7 +504,9 @@ def to_arrow_type(dtype) -> pa.lib.DataType: ------- pa.lib.DataType """ - return pandas.api.types.pandas_dtype(dtype).pyarrow_dtype + if is_string_dtype(dtype): + return pa.from_numpy_dtype(str) + return pa.from_numpy_dtype(dtype) def get_common_arrow_type(t1: pa.lib.DataType, t2: pa.lib.DataType) -> pa.lib.DataType: diff --git 
a/modin/pandas/general.py b/modin/pandas/general.py index 6c79752847d..aeff9986f35 100644 --- a/modin/pandas/general.py +++ b/modin/pandas/general.py @@ -415,7 +415,7 @@ def value_counts( ) -# @_inherit_docstrings(pandas.concat, apilink="pandas.concat") +@_inherit_docstrings(pandas.concat, apilink="pandas.concat") @enable_logging def concat( objs: "Iterable[DataFrame | Series] | Mapping[Hashable, DataFrame | Series]", From 9562144575fe0b8246a534ff543510b5befceb5b Mon Sep 17 00:00:00 2001 From: Anatoly Myachev Date: Fri, 10 May 2024 13:30:09 +0200 Subject: [PATCH 29/50] updates Signed-off-by: Anatoly Myachev --- .../dataframe/pandas/dataframe/dataframe.py | 45 ++++++++++++++++--- 1 file changed, 39 insertions(+), 6 deletions(-) diff --git a/modin/core/dataframe/pandas/dataframe/dataframe.py b/modin/core/dataframe/pandas/dataframe/dataframe.py index a103921bc72..fd882539c47 100644 --- a/modin/core/dataframe/pandas/dataframe/dataframe.py +++ b/modin/core/dataframe/pandas/dataframe/dataframe.py @@ -108,7 +108,7 @@ class PandasDataframe( _index_cache: ModinIndex = None _columns_cache: ModinIndex = None _dtypes: Optional[ModinDtypes] = None - _pandas_backend: str = None + _pandas_backend: Optional[str] = None @cached_property def __constructor__(self) -> type[PandasDataframe]: @@ -1321,6 +1321,7 @@ def _take_2d_positional( new_row_lengths, new_col_widths, new_dtypes, + # CHECKED: backend preserved pandas_backend=self._pandas_backend, ) @@ -1460,7 +1461,9 @@ def from_labels(self) -> PandasDataframe: new_column_names = pandas.Index(level_names, tupleize_cols=False) new_columns = new_column_names.append(self.columns) - def from_labels_executor(df, **kwargs): + def from_labels_executor( + df: pandas.DataFrame, **kwargs + ) -> pandas.DataFrame: # pragma: no cover # Setting the names here ensures that external and internal metadata always match. 
df.index.names = new_column_names @@ -1496,6 +1499,7 @@ def from_labels_executor(df, **kwargs): row_lengths=self._row_lengths_cache, column_widths=new_column_widths, dtypes=new_dtypes, + # CHECKED: backend preserved pandas_backend=self._pandas_backend, ) # Set flag for propagating deferred row labels across dataframe partitions @@ -1629,6 +1633,7 @@ def _reorder_labels(self, row_positions=None, col_positions=None): new_lengths, new_widths, new_dtypes, + # CHECKED: backend preserved pandas_backend=self._pandas_backend, ) @@ -1649,6 +1654,7 @@ def copy(self): self._row_lengths_cache, self._column_widths_cache, self.copy_dtypes_cache(), + # CHECKED: backend preserved pandas_backend=self._pandas_backend, ) @@ -1754,6 +1760,7 @@ def astype_builder(df): self._row_lengths_cache, self._column_widths_cache, new_dtypes, + # TODO: backend can be changed pandas_backend=self._pandas_backend, ) @@ -2115,6 +2122,7 @@ def _compute_tree_reduce_metadata(self, axis, new_parts, dtypes=None): *new_axes, *new_axes_lengths, dtypes, + # CHECKED: backend may be changed depending on function pandas_backend=self._pandas_backend, ) return result @@ -2301,6 +2309,7 @@ def map( self._row_lengths_cache, self._column_widths_cache, dtypes=dtypes, + # CHECKED: backend may be changed depending on function pandas_backend=self._pandas_backend, ) @@ -2381,6 +2390,7 @@ def fold(self, axis, func, new_columns=None): self.copy_columns_cache(copy_lengths=True), self._row_lengths_cache, self._column_widths_cache, + # CHECKED: backend may be changed depending on function pandas_backend=self._pandas_backend, ) @@ -2428,6 +2438,7 @@ def infer_types(self, col_labels: List[str]) -> PandasDataframe: self._row_lengths_cache, self._column_widths_cache, new_dtypes, + # CHECKED: backend may be changed depending on `new_cols_dtypes` pandas_backend=self._pandas_backend, ) @@ -2534,6 +2545,7 @@ def combine_and_apply( self._row_lengths_cache, [len(self.columns)] if self.has_materialized_columns else None, self.copy_dtypes_cache(), + # CHECKED: backend preserved pandas_backend=self._pandas_backend, ) else: @@ -2838,6 +2850,7 @@ def filter(self, axis: Union[Axis, int], condition: Callable) -> PandasDataframe *new_axes, *new_lengths, self.copy_dtypes_cache() if axis == Axis.COL_WISE else None, + # CHECKED: backend preserved pandas_backend=self._pandas_backend, ) @@ -2897,6 +2910,7 @@ def explode(self, axis: Union[int, Axis], func: Callable) -> PandasDataframe: new_columns, row_lengths, column_widths, + # TODO: need check pandas_backend=self._pandas_backend, ) @@ -2925,6 +2939,7 @@ def combine(self) -> PandasDataframe: else None ), dtypes=self.copy_dtypes_cache(), + # CHECKED: backend preserved pandas_backend=self._pandas_backend, ) result.synchronize_labels() @@ -3081,6 +3096,7 @@ def apply_full_axis_select_indices( None, None, dtypes=new_dtypes, + # CHECKED: backend may be changed depending on function pandas_backend=self._pandas_backend, ) @@ -3176,6 +3192,7 @@ def apply_select_indices( lengths_objs[0], lengths_objs[1], new_dtypes, + # CHECKED: backend may be changed depending on function pandas_backend=self._pandas_backend, ) else: @@ -3204,6 +3221,7 @@ def apply_select_indices( self._row_lengths_cache, self._column_widths_cache, new_dtypes, + # CHECKED: backend may be changed depending on function pandas_backend=self._pandas_backend, ) @@ -3310,6 +3328,7 @@ def _pick_axis(get_axis, sizes_cache): new_row_lengths, new_column_widths, dtypes=dtypes, + # CHECKED: backend may be changed depending on function pandas_backend=self._pandas_backend, ) @@ 
-3375,14 +3394,14 @@ def broadcast_apply_select_indices( self, axis, func, - other, + other: PandasDataframe, apply_indices=None, numeric_indices=None, keep_remaining=False, broadcast_all=True, new_index=None, new_columns=None, - ): + ) -> PandasDataframe: """ Apply a function to select indices at specified axis and broadcast partitions of `other` Modin DataFrame. @@ -3452,6 +3471,7 @@ def broadcast_apply_select_indices( new_partitions, index=new_index, columns=new_columns, + # CHECKED: backend may be changed depending on function pandas_backend=self._pandas_backend, ) @@ -3576,6 +3596,7 @@ def broadcast_apply_full_axis( else: if new_columns is None: assert not is_list_like(dtypes) + # CHECKED: backend may be changed depending on function dtype = self.construct_dtype(dtypes, self._pandas_backend) kw["dtypes"] = ModinDtypes(DtypesDescriptor(remaining_dtype=dtype)) else: @@ -3583,6 +3604,7 @@ def broadcast_apply_full_axis( pandas.Series(dtypes, index=new_columns) if is_list_like(dtypes) else pandas.Series( + # CHECKED: backend may be changed depending on function [self.construct_dtype(dtypes, self._pandas_backend)] * len(new_columns), index=new_columns, @@ -3652,6 +3674,7 @@ def broadcast_apply_full_axis( index=new_index, columns=new_columns, **kw, + # CHECKED: backend may be changed depending on function pandas_backend=self._pandas_backend, ) if sync_labels and new_index is not None: @@ -3832,12 +3855,12 @@ def _copartition( def n_ary_op( self, op, - right_frames: list, + right_frames: list[PandasDataframe], join_type="outer", copartition_along_columns=True, labels="replace", dtypes: Optional[pandas.Series] = None, - ): + ) -> PandasDataframe: """ Perform an n-opary operation by joining with other Modin DataFrame(s). @@ -3874,6 +3897,7 @@ def n_ary_op( self.copy_columns_cache(copy_lengths=True), row_lengths, self._column_widths_cache, + # CHECKED: backend preserved pandas_backend=self._pandas_backend, ) new_right_frames = [ @@ -3883,6 +3907,7 @@ def n_ary_op( right_frame.copy_columns_cache(copy_lengths=True), row_lengths, right_frame._column_widths_cache, + # CHECKED: backend preserved pandas_backend=self._pandas_backend, ) for right_parts, right_frame in zip(list_of_right_parts, right_frames) @@ -3921,6 +3946,7 @@ def n_ary_op( row_lengths, column_widths, dtypes, + # CHECKED: backend may be changed depending on function pandas_backend=self._pandas_backend, ) @@ -4055,6 +4081,7 @@ def _compute_new_widths(): new_lengths, new_widths, new_dtypes, + # CHECKED: backend preserved pandas_backend=self._pandas_backend, ) @@ -4130,6 +4157,7 @@ def _apply_func_to_range_partitioning_broadcast( index=new_index, columns=new_columns, dtypes=new_dtypes, + # CHECKED: backend may be changed depending on function pandas_backend=self._pandas_backend, ) @@ -4479,6 +4507,7 @@ def join_cols(df, *cols): new_partitions, index=result.copy_index_cache(), row_lengths=result._row_lengths_cache, + # CHECKED: backend may be changed depending on function pandas_backend=self._pandas_backend, ) @@ -4559,6 +4588,7 @@ def groupby_reduce( new_partitions, index=new_index, columns=new_columns, + # CHECKED: backend may be changed depending on function pandas_backend=self._pandas_backend, ) @@ -4744,6 +4774,7 @@ def transpose(self): self._column_widths_cache, self._row_lengths_cache, dtypes=new_dtypes, + # TODO: backend preserved? 
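+            # (Speculative note on the question above: transpose folds the
+            # original per-column dtypes into every result column, so a
+            # pyarrow-only tag may no longer describe the transposed data.)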
pandas_backend=self._pandas_backend, ) @@ -4932,6 +4963,7 @@ def remote_fn(df, name, caselist): # pragma: no cover columns, row_lengths, column_widths, + # CHECKED: backend may be changed depending on function pandas_backend=self._pandas_backend, ) for part in list_of_right_parts @@ -5004,5 +5036,6 @@ def map_data( index=self.index, row_lengths=lengths, column_widths=[1], + # CHECKED: backend may be changed depending on function pandas_backend=self._pandas_backend, ) From 8b93500af4ec2863c6777b9a1d7098eb70aa2faf Mon Sep 17 00:00:00 2001 From: Anatoly Myachev Date: Sun, 12 May 2024 23:12:36 +0200 Subject: [PATCH 30/50] fixes after merge Signed-off-by: Anatoly Myachev --- modin/core/storage_formats/base/query_compiler.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/modin/core/storage_formats/base/query_compiler.py b/modin/core/storage_formats/base/query_compiler.py index 2add02740d5..a84662c31f4 100644 --- a/modin/core/storage_formats/base/query_compiler.py +++ b/modin/core/storage_formats/base/query_compiler.py @@ -22,7 +22,7 @@ import abc import warnings from functools import cached_property -from typing import TYPE_CHECKING, Callable, Hashable, List, Optional +from typing import TYPE_CHECKING, Hashable, List, Optional import numpy as np import pandas From ae861e3aa6f092cc154c536bb1875e21e4237741 Mon Sep 17 00:00:00 2001 From: Anatoly Myachev Date: Mon, 13 May 2024 01:02:21 +0200 Subject: [PATCH 31/50] new approach Signed-off-by: Anatoly Myachev --- modin/core/dataframe/algebra/binary.py | 19 +---- modin/core/dataframe/algebra/fold.py | 13 +--- modin/core/dataframe/algebra/map.py | 9 +-- modin/core/dataframe/algebra/tree_reduce.py | 2 +- .../dataframe/pandas/dataframe/dataframe.py | 72 ++++++------------- .../pandas/partitioning/partition_manager.py | 5 +- .../storage_formats/base/query_compiler.py | 5 +- modin/pandas/utils.py | 9 +++ 8 files changed, 40 insertions(+), 94 deletions(-) diff --git a/modin/core/dataframe/algebra/binary.py b/modin/core/dataframe/algebra/binary.py index acc84460b0e..b55df138bfd 100644 --- a/modin/core/dataframe/algebra/binary.py +++ b/modin/core/dataframe/algebra/binary.py @@ -85,7 +85,6 @@ def maybe_compute_dtypes_common_cast( # belong to the intersection, these will be NaN columns in the result mismatch_columns = columns_first ^ columns_second elif isinstance(second, dict): - # TODO: pyarrow backend dtypes_second = { key: pandas.api.types.pandas_dtype(type(value)) for key, value in second.items() @@ -98,7 +97,6 @@ def maybe_compute_dtypes_common_cast( mismatch_columns = columns_first.difference(columns_second) else: if isinstance(second, (list, tuple)): - # TODO: pyarrow backend second_dtypes_list = ( [pandas.api.types.pandas_dtype(type(value)) for value in second] if axis == 1 @@ -107,7 +105,6 @@ def maybe_compute_dtypes_common_cast( else [np.array(second).dtype] * len(dtypes_first) ) elif is_scalar(second) or isinstance(second, np.ndarray): - # TODO: pyarrow backend try: dtype = getattr(second, "dtype", None) or pandas.api.types.pandas_dtype( type(second) @@ -133,7 +130,6 @@ def maybe_compute_dtypes_common_cast( mismatch_columns = [] # If at least one column doesn't match, the result of the non matching column would be nan. - # TODO: pyarrow backend nan_dtype = pandas.api.types.pandas_dtype(type(np.nan)) dtypes = None if func is not None: @@ -249,7 +245,7 @@ def try_compute_new_dtypes( infer_dtypes : {"common_cast", "try_sample", "bool", None}, default: None How dtypes should be infered (see ``Binary.register`` doc for more info). 
result_dtype : np.dtype, optional - NumPy dtype of the result. If not specified it will be inferred from the `infer_dtypes` parameter. Only NumPy? + NumPy dtype of the result. If not specified it will be inferred from the `infer_dtypes` parameter. axis : int, default: 0 Axis to perform the binary operation along. func : callable(pandas.DataFrame, pandas.DataFrame) -> pandas.DataFrame, optional @@ -264,19 +260,8 @@ def try_compute_new_dtypes( try: if infer_dtypes == "bool" or is_bool_dtype(result_dtype): - # dataframe can contain types of different backends at the same time, for example: - # (Pdb) (pandas.DataFrame([[1,2,3], [4,5,6]]).astype({0: "int64[pyarrow]"}) > 4).dtypes - # 0 bool[pyarrow] - # 1 bool - # 2 bool - # dtype: object - backend = "" - if any("pyarrow" in str(x) for x in first.dtypes) or any( - "pyarrow" in str(x) for x in second.dtypes - ): - backend = "[pyarrow]" dtypes = maybe_build_dtypes_series( - first, second, dtype=pandas.api.types.pandas_dtype(f"bool{backend}") + first, second, dtype=pandas.api.types.pandas_dtype(bool) ) elif infer_dtypes == "common_cast": dtypes = maybe_compute_dtypes_common_cast( diff --git a/modin/core/dataframe/algebra/fold.py b/modin/core/dataframe/algebra/fold.py index 9f6673a3e0a..419a0b56903 100644 --- a/modin/core/dataframe/algebra/fold.py +++ b/modin/core/dataframe/algebra/fold.py @@ -13,21 +13,14 @@ """Module houses builder class for Fold operator.""" -from __future__ import annotations - -from typing import TYPE_CHECKING, Callable - from .operator import Operator -if TYPE_CHECKING: - from modin.core.storage_formats.pandas.query_compiler import PandasQueryCompiler - class Fold(Operator): """Builder class for Fold functions.""" @classmethod - def register(cls, fold_function) -> Callable[..., PandasQueryCompiler]: + def register(cls, fold_function): """ Build Fold operator that will be performed across rows/columns. @@ -42,9 +35,7 @@ def register(cls, fold_function) -> Callable[..., PandasQueryCompiler]: Function that takes query compiler and executes Fold function. """ - def caller( - query_compiler: PandasQueryCompiler, fold_axis=None, *args, **kwargs - ) -> PandasQueryCompiler: + def caller(query_compiler, fold_axis=None, *args, **kwargs): """ Execute Fold function against passed query compiler. diff --git a/modin/core/dataframe/algebra/map.py b/modin/core/dataframe/algebra/map.py index aefebe6c017..57b21f6e1b0 100644 --- a/modin/core/dataframe/algebra/map.py +++ b/modin/core/dataframe/algebra/map.py @@ -13,15 +13,8 @@ """Module houses builder class for Map operator.""" -from __future__ import annotations - -from typing import TYPE_CHECKING - from .operator import Operator -if TYPE_CHECKING: - from modin.core.storage_formats.pandas.query_compiler import PandasQueryCompiler - class Map(Operator): """Builder class for Map operator.""" @@ -48,7 +41,7 @@ def register(cls, function, *call_args, **call_kwds): Function that takes query compiler and executes map function. 
""" - def caller(query_compiler: PandasQueryCompiler, *args, **kwargs): + def caller(query_compiler, *args, **kwargs): """Execute Map function against passed query compiler.""" shape_hint = call_kwds.pop("shape_hint", None) or query_compiler._shape_hint return query_compiler.__constructor__( diff --git a/modin/core/dataframe/algebra/tree_reduce.py b/modin/core/dataframe/algebra/tree_reduce.py index 8a30196cbeb..fa7b731e6f5 100644 --- a/modin/core/dataframe/algebra/tree_reduce.py +++ b/modin/core/dataframe/algebra/tree_reduce.py @@ -35,7 +35,7 @@ def register( axis : int, optional Specifies axis to apply function along. compute_dtypes : callable(pandas.Series, *func_args, **func_kwargs) -> np.dtype, optional - Callable for computing dtypes. Only NumPy? + Callable for computing dtypes. Returns ------- diff --git a/modin/core/dataframe/pandas/dataframe/dataframe.py b/modin/core/dataframe/pandas/dataframe/dataframe.py index fd882539c47..8059f1c988d 100644 --- a/modin/core/dataframe/pandas/dataframe/dataframe.py +++ b/modin/core/dataframe/pandas/dataframe/dataframe.py @@ -65,7 +65,11 @@ from modin.logging import ClassLogger from modin.logging.config import LogLevel from modin.pandas.indexing import is_range_like -from modin.pandas.utils import check_both_not_none, is_full_grab_slice +from modin.pandas.utils import ( + check_both_not_none, + get_pandas_backend, + is_full_grab_slice, +) from modin.utils import MODIN_UNNAMED_SERIES_LABEL @@ -136,8 +140,15 @@ def __init__( self.set_columns_cache(columns) self._row_lengths_cache = row_lengths self._column_widths_cache = column_widths - self.set_dtypes_cache(dtypes) self._pandas_backend = pandas_backend + if not pandas_backend == "pyarrow": + # In this case, the type precomputation may be incorrect; we need + # to know the type algebra precisely. Considering the number of operations + # and different combinations of backends, the best solution would be to + # introduce optimizations gradually, with a large number of tests. + self.set_dtypes_cache(dtypes) + else: + self.set_dtypes_cache(None) self._validate_axes_lengths() self._filter_empties(compute_metadata=False) @@ -406,6 +417,9 @@ def dtypes(self): else: dtypes = self._compute_dtypes() self.set_dtypes_cache(dtypes) + # During materialization, we can find out the backend and, if it + # is suitable, use the ability to pre-calculate types. 
+ self._pandas_backend = get_pandas_backend(dtypes) return dtypes def get_dtypes_set(self): @@ -1321,7 +1335,6 @@ def _take_2d_positional( new_row_lengths, new_col_widths, new_dtypes, - # CHECKED: backend preserved pandas_backend=self._pandas_backend, ) @@ -1499,7 +1512,6 @@ def from_labels_executor( row_lengths=self._row_lengths_cache, column_widths=new_column_widths, dtypes=new_dtypes, - # CHECKED: backend preserved pandas_backend=self._pandas_backend, ) # Set flag for propagating deferred row labels across dataframe partitions @@ -1633,7 +1645,6 @@ def _reorder_labels(self, row_positions=None, col_positions=None): new_lengths, new_widths, new_dtypes, - # CHECKED: backend preserved pandas_backend=self._pandas_backend, ) @@ -1654,7 +1665,6 @@ def copy(self): self._row_lengths_cache, self._column_widths_cache, self.copy_dtypes_cache(), - # CHECKED: backend preserved pandas_backend=self._pandas_backend, ) @@ -1688,7 +1698,6 @@ def astype(self, col_dtypes, errors: str = "raise"): if new_dtypes is None: new_dtypes = self_dtypes.copy() # Update the new dtype series to the proper pandas dtype - # We don't need to add an implicit backend for `astype` new_dtype = pandas.api.types.pandas_dtype(dtype) if Engine.get() == "Dask" and hasattr(dtype, "_is_materialized"): # FIXME: https://github.com/dask/distributed/issues/8585 @@ -1718,7 +1727,6 @@ def astype_builder(df): # Assume that the dtype is a scalar. if not (col_dtypes == self_dtypes).all(): new_dtypes = self_dtypes.copy() - # We don't need to add an implicit backend for `astype` new_dtype = pandas.api.types.pandas_dtype(col_dtypes) if Engine.get() == "Dask" and hasattr(new_dtype, "_is_materialized"): # FIXME: https://github.com/dask/distributed/issues/8585 @@ -1752,7 +1760,7 @@ def astype_builder(df): new_frame = self._partition_mgr_cls.lazy_map_partitions( self._partitions, astype_builder ) - # TODO: recompute _pandas_backend (it can be changed) + return self.__constructor__( new_frame, self.copy_index_cache(copy_lengths=True), @@ -1760,8 +1768,7 @@ def astype_builder(df): self._row_lengths_cache, self._column_widths_cache, new_dtypes, - # TODO: backend can be changed - pandas_backend=self._pandas_backend, + pandas_backend=get_pandas_backend(new_dtypes), ) def numeric_columns(self, include_bool=True): @@ -2113,7 +2120,7 @@ def _compute_tree_reduce_metadata(self, axis, new_parts, dtypes=None): dtypes = self.copy_dtypes_cache() elif dtypes is not None: dtypes = pandas.Series( - [self.construct_dtype(dtypes, self._pandas_backend)] * len(new_axes[1]), + [pandas.api.types.pandas_dtype(dtypes)] * len(new_axes[1]), index=new_axes[1], ) @@ -2122,7 +2129,6 @@ def _compute_tree_reduce_metadata(self, axis, new_parts, dtypes=None): *new_axes, *new_axes_lengths, dtypes, - # CHECKED: backend may be changed depending on function pandas_backend=self._pandas_backend, ) return result @@ -2299,7 +2305,7 @@ def map( # Materializing lazy columns in order to build dtype's index new_columns = new_columns.get(return_lengths=False) dtypes = pandas.Series( - [self.construct_dtype(dtypes, self._pandas_backend)] * len(new_columns), + [pandas.api.types.pandas_dtype(dtypes)] * len(new_columns), index=new_columns, ) return self.__constructor__( @@ -2309,7 +2315,6 @@ def map( self._row_lengths_cache, self._column_widths_cache, dtypes=dtypes, - # CHECKED: backend may be changed depending on function pandas_backend=self._pandas_backend, ) @@ -2390,7 +2395,6 @@ def fold(self, axis, func, new_columns=None): self.copy_columns_cache(copy_lengths=True), self._row_lengths_cache, 
self._column_widths_cache, - # CHECKED: backend may be changed depending on function pandas_backend=self._pandas_backend, ) @@ -2438,7 +2442,6 @@ def infer_types(self, col_labels: List[str]) -> PandasDataframe: self._row_lengths_cache, self._column_widths_cache, new_dtypes, - # CHECKED: backend may be changed depending on `new_cols_dtypes` pandas_backend=self._pandas_backend, ) @@ -2545,7 +2548,6 @@ def combine_and_apply( self._row_lengths_cache, [len(self.columns)] if self.has_materialized_columns else None, self.copy_dtypes_cache(), - # CHECKED: backend preserved pandas_backend=self._pandas_backend, ) else: @@ -2850,7 +2852,6 @@ def filter(self, axis: Union[Axis, int], condition: Callable) -> PandasDataframe *new_axes, *new_lengths, self.copy_dtypes_cache() if axis == Axis.COL_WISE else None, - # CHECKED: backend preserved pandas_backend=self._pandas_backend, ) @@ -2910,7 +2911,6 @@ def explode(self, axis: Union[int, Axis], func: Callable) -> PandasDataframe: new_columns, row_lengths, column_widths, - # TODO: need check pandas_backend=self._pandas_backend, ) @@ -2939,7 +2939,6 @@ def combine(self) -> PandasDataframe: else None ), dtypes=self.copy_dtypes_cache(), - # CHECKED: backend preserved pandas_backend=self._pandas_backend, ) result.synchronize_labels() @@ -3096,7 +3095,6 @@ def apply_full_axis_select_indices( None, None, dtypes=new_dtypes, - # CHECKED: backend may be changed depending on function pandas_backend=self._pandas_backend, ) @@ -3192,7 +3190,6 @@ def apply_select_indices( lengths_objs[0], lengths_objs[1], new_dtypes, - # CHECKED: backend may be changed depending on function pandas_backend=self._pandas_backend, ) else: @@ -3221,7 +3218,6 @@ def apply_select_indices( self._row_lengths_cache, self._column_widths_cache, new_dtypes, - # CHECKED: backend may be changed depending on function pandas_backend=self._pandas_backend, ) @@ -3328,7 +3324,6 @@ def _pick_axis(get_axis, sizes_cache): new_row_lengths, new_column_widths, dtypes=dtypes, - # CHECKED: backend may be changed depending on function pandas_backend=self._pandas_backend, ) @@ -3471,18 +3466,9 @@ def broadcast_apply_select_indices( new_partitions, index=new_index, columns=new_columns, - # CHECKED: backend may be changed depending on function pandas_backend=self._pandas_backend, ) - def construct_dtype(self, dtype: str, backend: Optional[str]): - if backend is None or dtype == "category": - return pandas.api.types.pandas_dtype(dtype) - elif backend == "pyarrow": - return pandas.api.types.pandas_dtype(f"{dtype}[{backend}]") - else: - raise NotImplementedError - @lazy_metadata_decorator(apply_axis="both") def broadcast_apply_full_axis( self, @@ -3596,17 +3582,14 @@ def broadcast_apply_full_axis( else: if new_columns is None: assert not is_list_like(dtypes) - # CHECKED: backend may be changed depending on function - dtype = self.construct_dtype(dtypes, self._pandas_backend) + dtype = pandas.api.types.pandas_dtype(dtypes) kw["dtypes"] = ModinDtypes(DtypesDescriptor(remaining_dtype=dtype)) else: kw["dtypes"] = ( pandas.Series(dtypes, index=new_columns) if is_list_like(dtypes) else pandas.Series( - # CHECKED: backend may be changed depending on function - [self.construct_dtype(dtypes, self._pandas_backend)] - * len(new_columns), + [pandas.api.types.pandas_dtype(dtypes)] * len(new_columns), index=new_columns, ) ) @@ -3674,7 +3657,6 @@ def broadcast_apply_full_axis( index=new_index, columns=new_columns, **kw, - # CHECKED: backend may be changed depending on function pandas_backend=self._pandas_backend, ) if sync_labels and 
new_index is not None: @@ -3897,7 +3879,6 @@ def n_ary_op( self.copy_columns_cache(copy_lengths=True), row_lengths, self._column_widths_cache, - # CHECKED: backend preserved pandas_backend=self._pandas_backend, ) new_right_frames = [ @@ -3907,7 +3888,6 @@ def n_ary_op( right_frame.copy_columns_cache(copy_lengths=True), row_lengths, right_frame._column_widths_cache, - # CHECKED: backend preserved pandas_backend=self._pandas_backend, ) for right_parts, right_frame in zip(list_of_right_parts, right_frames) @@ -3946,7 +3926,6 @@ def n_ary_op( row_lengths, column_widths, dtypes, - # CHECKED: backend may be changed depending on function pandas_backend=self._pandas_backend, ) @@ -4081,7 +4060,6 @@ def _compute_new_widths(): new_lengths, new_widths, new_dtypes, - # CHECKED: backend preserved pandas_backend=self._pandas_backend, ) @@ -4157,7 +4135,6 @@ def _apply_func_to_range_partitioning_broadcast( index=new_index, columns=new_columns, dtypes=new_dtypes, - # CHECKED: backend may be changed depending on function pandas_backend=self._pandas_backend, ) @@ -4507,7 +4484,6 @@ def join_cols(df, *cols): new_partitions, index=result.copy_index_cache(), row_lengths=result._row_lengths_cache, - # CHECKED: backend may be changed depending on function pandas_backend=self._pandas_backend, ) @@ -4588,7 +4564,6 @@ def groupby_reduce( new_partitions, index=new_index, columns=new_columns, - # CHECKED: backend may be changed depending on function pandas_backend=self._pandas_backend, ) @@ -4774,7 +4749,6 @@ def transpose(self): self._column_widths_cache, self._row_lengths_cache, dtypes=new_dtypes, - # TODO: backend preserved? pandas_backend=self._pandas_backend, ) @@ -4963,7 +4937,6 @@ def remote_fn(df, name, caselist): # pragma: no cover columns, row_lengths, column_widths, - # CHECKED: backend may be changed depending on function pandas_backend=self._pandas_backend, ) for part in list_of_right_parts @@ -5036,6 +5009,5 @@ def map_data( index=self.index, row_lengths=lengths, column_widths=[1], - # CHECKED: backend may be changed depending on function pandas_backend=self._pandas_backend, ) diff --git a/modin/core/dataframe/pandas/partitioning/partition_manager.py b/modin/core/dataframe/pandas/partitioning/partition_manager.py index 86b8ed5cd81..0f2e99cfb22 100644 --- a/modin/core/dataframe/pandas/partitioning/partition_manager.py +++ b/modin/core/dataframe/pandas/partitioning/partition_manager.py @@ -40,6 +40,7 @@ from modin.error_message import ErrorMessage from modin.logging import ClassLogger from modin.logging.config import LogLevel +from modin.pandas.utils import get_pandas_backend if TYPE_CHECKING: from modin.core.dataframe.pandas.dataframe.utils import ShuffleFunctions @@ -985,9 +986,7 @@ def update_bar(f): parts = cls.split_pandas_df_into_partitions( df, row_chunksize, col_chunksize, update_bar ) - backend = None - if any(isinstance(x, pandas.ArrowDtype) for x in df.dtypes): - backend = "pyarrow" + backend = get_pandas_backend(df.dtypes) if ProgressBar.get(): pbar.close() if not return_dims: diff --git a/modin/core/storage_formats/base/query_compiler.py b/modin/core/storage_formats/base/query_compiler.py index a84662c31f4..31e3feb7bdc 100644 --- a/modin/core/storage_formats/base/query_compiler.py +++ b/modin/core/storage_formats/base/query_compiler.py @@ -6754,10 +6754,7 @@ def case_when(self, caselist): # noqa: PR01, RT01, D200 ] return SeriesDefault.register(pandas.Series.case_when)(self, caselist=caselist) - def construct_dtype(self, dtype: str, backend: Optional[str]): - return 
self._modin_frame.construct_dtype(dtype, backend) - - def get_backend(self) -> str: + def get_backend(self) -> Optional[str]: return self._modin_frame._pandas_backend def repartition(self, axis=None): diff --git a/modin/pandas/utils.py b/modin/pandas/utils.py index 3a9ad6aa13d..6039baa2c7b 100644 --- a/modin/pandas/utils.py +++ b/modin/pandas/utils.py @@ -13,6 +13,8 @@ """Implement utils for pandas component.""" +from __future__ import annotations + from typing import Iterator, Optional, Tuple import numpy as np @@ -116,6 +118,13 @@ def is_scalar(obj): return not isinstance(obj, BasePandasDataset) and pandas_is_scalar(obj) +def get_pandas_backend(dtypes: pandas.Series) -> str | None: + backend = None + if any(isinstance(x, pandas.ArrowDtype) for x in dtypes): + backend = "pyarrow" + return backend + + def is_full_grab_slice(slc, sequence_len=None): """ Check that the passed slice grabs the whole sequence. From 5b18cfdce766a89bebf87b0fe26cea749c08cf39 Mon Sep 17 00:00:00 2001 From: Anatoly Myachev Date: Mon, 13 May 2024 01:23:09 +0200 Subject: [PATCH 32/50] cleanup Signed-off-by: Anatoly Myachev --- .../core/storage_formats/base/query_compiler.py | 1 - .../core/storage_formats/pandas/aggregations.py | 2 -- .../storage_formats/pandas/query_compiler.py | 1 - .../distributed/dataframe/pandas/partitions.py | 4 ++-- modin/logging/logger_decorator.py | 17 ++++------------- modin/numpy/indexing.py | 4 ++-- modin/pandas/base.py | 12 ++++++------ modin/pandas/dataframe.py | 3 --- modin/pandas/indexing.py | 4 ++-- 9 files changed, 16 insertions(+), 32 deletions(-) diff --git a/modin/core/storage_formats/base/query_compiler.py b/modin/core/storage_formats/base/query_compiler.py index 31e3feb7bdc..1d6cc719d17 100644 --- a/modin/core/storage_formats/base/query_compiler.py +++ b/modin/core/storage_formats/base/query_compiler.py @@ -4288,7 +4288,6 @@ def get_positions_from_labels(self, row_loc, col_loc): # `Index.get_indexer_for` works much faster with numpy arrays than with python lists, # so although we lose some time here on converting to numpy, `Index.get_indexer_for` # speedup covers the loss that we gain here. - # TODO: pyarrow backend? axis_loc = np.array(axis_loc, dtype=axis_labels.dtype) axis_lookup = axis_labels.get_indexer_for(axis_loc) # `Index.get_indexer_for` sets -1 value for missing labels, we have to verify whether diff --git a/modin/core/storage_formats/pandas/aggregations.py b/modin/core/storage_formats/pandas/aggregations.py index 6c2e795a523..fd7d84f49d8 100644 --- a/modin/core/storage_formats/pandas/aggregations.py +++ b/modin/core/storage_formats/pandas/aggregations.py @@ -67,7 +67,6 @@ def corr_method( qc._modin_frame.copy_columns_cache(), ) new_dtypes = pandas.Series( - # TODO: pyarrow backend? np.repeat(pandas.api.types.pandas_dtype("float"), len(new_columns)), index=new_columns, ) @@ -77,7 +76,6 @@ def corr_method( new_columns = old_dtypes[old_dtypes.map(is_numeric_dtype)].index new_index = new_columns.copy() new_dtypes = pandas.Series( - # TODO: pyarrow backend? np.repeat(pandas.api.types.pandas_dtype("float"), len(new_columns)), index=new_columns, ) diff --git a/modin/core/storage_formats/pandas/query_compiler.py b/modin/core/storage_formats/pandas/query_compiler.py index 33e05a6c491..3df16fee11b 100644 --- a/modin/core/storage_formats/pandas/query_compiler.py +++ b/modin/core/storage_formats/pandas/query_compiler.py @@ -3088,7 +3088,6 @@ def mapper(df: pandas.DataFrame): ) # we have to keep other columns so setting their mask # values with `False` - # TODO: pyarrow backend? 
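+            # (a NumPy bool mask stays valid here regardless of the column
+            # backend, since it only marks column positions to keep; hedged
+            # rationale for dropping the TODO above)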
mask = pandas.Series( np.zeros(df.shape[1], dtype=bool), index=df.columns ) diff --git a/modin/distributed/dataframe/pandas/partitions.py b/modin/distributed/dataframe/pandas/partitions.py index 62a05ff81d3..cac3bec93b6 100644 --- a/modin/distributed/dataframe/pandas/partitions.py +++ b/modin/distributed/dataframe/pandas/partitions.py @@ -90,7 +90,7 @@ def unwrap_partitions( f"Only API Layer objects may be passed in here, got {type(api_layer_object)} instead." ) - modin_frame = api_layer_object._query_compiler._modin_frame + modin_frame = api_layer_object._query_compiler._modin_frame # type: ignore[attr-defined] modin_frame._propagate_index_objs(None) if axis is None: @@ -122,7 +122,7 @@ def get_block(partition: PartitionUnionType) -> np.ndarray: ] actual_engine = type( - api_layer_object._query_compiler._modin_frame._partitions[0][0] + api_layer_object._query_compiler._modin_frame._partitions[0][0] # type: ignore[attr-defined] ).__name__ if actual_engine in ( "PandasOnRayDataframePartition", diff --git a/modin/logging/logger_decorator.py b/modin/logging/logger_decorator.py index 662f7d1de73..301fb02562b 100644 --- a/modin/logging/logger_decorator.py +++ b/modin/logging/logger_decorator.py @@ -19,7 +19,7 @@ from functools import wraps from types import FunctionType, MethodType -from typing import Any, Callable, Dict, Optional, Tuple, Type, TypeVar, Union, overload +from typing import Any, Callable, Dict, Optional, Tuple, Type, Union from modin.config import LogMode @@ -28,9 +28,6 @@ _MODIN_LOGGER_NOWRAP = "__modin_logging_nowrap__" -Fn = TypeVar("Fn", bound=Callable) - - def disable_logging(func: Callable) -> Callable: """ Disable logging of one particular function. Useful for decorated classes. @@ -49,17 +46,11 @@ def disable_logging(func: Callable) -> Callable: return func -@overload -def enable_logging(modin_layer: Fn) -> Fn: - # This helps preserve typings when the decorator is used without parentheses - ... - - def enable_logging( - modin_layer: Union[str, Fn, Type] = "PANDAS-API", + modin_layer: Union[str, Callable, Type] = "PANDAS-API", name: Optional[str] = None, log_level: LogLevel = LogLevel.INFO, -) -> Callable[[Fn], Fn]: +) -> Callable: """ Log Decorator used on specific Modin functions or classes. @@ -85,7 +76,7 @@ def enable_logging( # def func() return enable_logging()(modin_layer) - def decorator(obj: Fn) -> Fn: + def decorator(obj: Any) -> Any: """Decorate function or class to add logs to Modin API function(s).""" if isinstance(obj, type): seen: Dict[Any, Any] = {} diff --git a/modin/numpy/indexing.py b/modin/numpy/indexing.py index 4223ae3e513..b598577a34d 100644 --- a/modin/numpy/indexing.py +++ b/modin/numpy/indexing.py @@ -214,7 +214,7 @@ def boolean_mask_to_numeric(indexer): # `itertools.compress` masks `data` with the `selectors` mask, # works about ~10% faster than a pure list comprehension itertools.compress(data=range(len(indexer)), selectors=indexer), - dtype="int64", + dtype=np.int64, ) @@ -585,7 +585,7 @@ def _compute_lookup(self, row_loc, col_loc): # `Index.__getitem__` works much faster with numpy arrays than with python lists, # so although we lose some time here on converting to numpy, `Index.__getitem__` # speedup covers the loss that we gain here. - axis_loc = np.array(axis_loc, dtype="int64") + axis_loc = np.array(axis_loc, dtype=np.int64) # Relatively fast check allows us to not trigger `self.qc.get_axis()` computation # if there're no negative indices and so they don't not depend on the axis length. 
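# A standalone sketch of the fast check performed by the `if` just below,
# assuming only numpy: with the positions already in an int64 array, "is any
# index negative?" is a single vectorized pass, so the potentially expensive
# axis length is computed only when a negative position must be wrapped.
import numpy as np

axis_loc = np.array([0, 2, 5], dtype=np.int64)
if not (axis_loc < 0).any():
    print("no negative positions; axis length not needed")
else:
    axis_len = 10  # hypothetical axis length, fetched only on this branch
    print(axis_loc % axis_len)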
if isinstance(axis_loc, np.ndarray) and not (axis_loc < 0).any(): diff --git a/modin/pandas/base.py b/modin/pandas/base.py index 322af7debbe..656644f426e 100644 --- a/modin/pandas/base.py +++ b/modin/pandas/base.py @@ -1534,7 +1534,7 @@ def eq(self, other, axis="columns", level=None): # noqa: PR01, RT01, D200 """ Get equality of `BasePandasDataset` and `other`, element-wise (binary operator `eq`). """ - return self._binary_op("eq", other, axis=axis, level=level, dtypes="bool") + return self._binary_op("eq", other, axis=axis, level=level, dtypes=np.bool_) def explode(self, column, ignore_index: bool = False): # noqa: PR01, RT01, D200 """ @@ -1835,7 +1835,7 @@ def ge(self, other, axis="columns", level=None): # noqa: PR01, RT01, D200 """ Get greater than or equal comparison of `BasePandasDataset` and `other`, element-wise (binary operator `ge`). """ - return self._binary_op("ge", other, axis=axis, level=level, dtypes="bool") + return self._binary_op("ge", other, axis=axis, level=level, dtypes=np.bool_) def get(self, key, default=None): # noqa: PR01, RT01, D200 """ @@ -1851,7 +1851,7 @@ def gt(self, other, axis="columns", level=None): # noqa: PR01, RT01, D200 """ Get greater than comparison of `BasePandasDataset` and `other`, element-wise (binary operator `gt`). """ - return self._binary_op("gt", other, axis=axis, level=level, dtypes="bool") + return self._binary_op("gt", other, axis=axis, level=level, dtypes=np.bool_) def head(self, n=5): # noqa: PR01, RT01, D200 """ @@ -1983,13 +1983,13 @@ def le(self, other, axis="columns", level=None): # noqa: PR01, RT01, D200 """ Get less than or equal comparison of `BasePandasDataset` and `other`, element-wise (binary operator `le`). """ - return self._binary_op("le", other, axis=axis, level=level, dtypes="bool") + return self._binary_op("le", other, axis=axis, level=level, dtypes=np.bool_) def lt(self, other, axis="columns", level=None): # noqa: PR01, RT01, D200 """ Get less than comparison of `BasePandasDataset` and `other`, element-wise (binary operator `lt`). """ - return self._binary_op("lt", other, axis=axis, level=level, dtypes="bool") + return self._binary_op("lt", other, axis=axis, level=level, dtypes=np.bool_) @property def loc(self): # noqa: RT01, D200 @@ -2198,7 +2198,7 @@ def ne(self, other, axis="columns", level=None): # noqa: PR01, RT01, D200 """ Get Not equal comparison of `BasePandasDataset` and `other`, element-wise (binary operator `ne`). """ - return self._binary_op("ne", other, axis=axis, level=level, dtypes="bool") + return self._binary_op("ne", other, axis=axis, level=level, dtypes=np.bool_) def notna(self): # noqa: RT01, D200 """ diff --git a/modin/pandas/dataframe.py b/modin/pandas/dataframe.py index b5ef0b1643d..53c68494249 100644 --- a/modin/pandas/dataframe.py +++ b/modin/pandas/dataframe.py @@ -3067,7 +3067,6 @@ def _validate_dtypes_min_max(self, axis, numeric_only) -> DataFrame: ): # check if there are columns with dtypes datetime or timedelta if all( - # TODO: pyarrow backend? dtype != pandas.api.types.pandas_dtype("datetime64[ns]") and dtype != pandas.api.types.pandas_dtype("timedelta64[ns]") for dtype in self.dtypes @@ -3104,7 +3103,6 @@ def _validate_dtypes_sum_prod_mean( not axis and numeric_only is False and any( - # TODO: pyarrow backend? dtype == pandas.api.types.pandas_dtype("datetime64[ns]") for dtype in self.dtypes ) @@ -3125,7 +3123,6 @@ def _validate_dtypes_sum_prod_mean( ): # check if there are columns with dtypes datetime or timedelta if all( - # TODO: pyarrow backend? 
dtype != pandas.api.types.pandas_dtype("datetime64[ns]") and dtype != pandas.api.types.pandas_dtype("timedelta64[ns]") for dtype in self.dtypes diff --git a/modin/pandas/indexing.py b/modin/pandas/indexing.py index 316a75f82a7..d901b6dac99 100644 --- a/modin/pandas/indexing.py +++ b/modin/pandas/indexing.py @@ -232,7 +232,7 @@ def boolean_mask_to_numeric(indexer): # `itertools.compress` masks `data` with the `selectors` mask, # works about ~10% faster than a pure list comprehension itertools.compress(data=range(len(indexer)), selectors=indexer), - dtype="int64", + dtype=np.int64, ) @@ -1130,7 +1130,7 @@ def _compute_lookup(self, row_loc, col_loc): # `Index.__getitem__` works much faster with numpy arrays than with python lists, # so although we lose some time here on converting to numpy, `Index.__getitem__` # speedup covers the loss that we gain here. - axis_loc = np.array(axis_loc, dtype="int64") + axis_loc = np.array(axis_loc, dtype=np.int64) # Relatively fast check allows us to not trigger `self.qc.get_axis()` computation # if there're no negative indices and so they don't not depend on the axis length. if isinstance(axis_loc, np.ndarray) and not (axis_loc < 0).any(): From 9c6ce78c4990f87a04fc64449f5e1e5eb71125ce Mon Sep 17 00:00:00 2001 From: Anatoly Myachev Date: Mon, 13 May 2024 01:37:31 +0200 Subject: [PATCH 33/50] cleanup Signed-off-by: Anatoly Myachev --- .../storage_formats/pandas/query_compiler.py | 166 +++++++++--------- 1 file changed, 84 insertions(+), 82 deletions(-) diff --git a/modin/core/storage_formats/pandas/query_compiler.py b/modin/core/storage_formats/pandas/query_compiler.py index 3df16fee11b..4ad25f5ace5 100644 --- a/modin/core/storage_formats/pandas/query_compiler.py +++ b/modin/core/storage_formats/pandas/query_compiler.py @@ -427,22 +427,22 @@ def to_numpy(self, **kwargs): combine_first = Binary.register( pandas.DataFrame.combine_first, infer_dtypes="common_cast" ) - eq = Binary.register(pandas.DataFrame.eq, infer_dtypes="bool") + eq = Binary.register(pandas.DataFrame.eq, infer_dtypes=np.bool_) equals = Binary.register( lambda df, other: pandas.DataFrame([[df.equals(other)]]), join_type=None, labels="drop", - infer_dtypes="bool", + infer_dtypes=np.bool_, ) floordiv = Binary.register(pandas.DataFrame.floordiv, infer_dtypes="try_sample") - ge = Binary.register(pandas.DataFrame.ge, infer_dtypes="bool") - gt = Binary.register(pandas.DataFrame.gt, infer_dtypes="bool") - le = Binary.register(pandas.DataFrame.le, infer_dtypes="bool") - lt = Binary.register(pandas.DataFrame.lt, infer_dtypes="bool") + ge = Binary.register(pandas.DataFrame.ge, infer_dtypes=np.bool_) + gt = Binary.register(pandas.DataFrame.gt, infer_dtypes=np.bool_) + le = Binary.register(pandas.DataFrame.le, infer_dtypes=np.bool_) + lt = Binary.register(pandas.DataFrame.lt, infer_dtypes=np.bool_) mod = Binary.register(pandas.DataFrame.mod, infer_dtypes="try_sample") mul = Binary.register(pandas.DataFrame.mul, infer_dtypes="try_sample") rmul = Binary.register(pandas.DataFrame.rmul, infer_dtypes="try_sample") - ne = Binary.register(pandas.DataFrame.ne, infer_dtypes="bool") + ne = Binary.register(pandas.DataFrame.ne, infer_dtypes=np.bool_) pow = Binary.register(pandas.DataFrame.pow, infer_dtypes="try_sample") radd = Binary.register(pandas.DataFrame.radd, infer_dtypes="try_sample") rfloordiv = Binary.register(pandas.DataFrame.rfloordiv, infer_dtypes="try_sample") @@ -452,12 +452,12 @@ def to_numpy(self, **kwargs): rtruediv = Binary.register(pandas.DataFrame.rtruediv, infer_dtypes="try_sample") sub = 
Binary.register(pandas.DataFrame.sub, infer_dtypes="try_sample") truediv = Binary.register(pandas.DataFrame.truediv, infer_dtypes="try_sample") - __and__ = Binary.register(pandas.DataFrame.__and__, infer_dtypes="bool") - __or__ = Binary.register(pandas.DataFrame.__or__, infer_dtypes="bool") - __rand__ = Binary.register(pandas.DataFrame.__rand__, infer_dtypes="bool") - __ror__ = Binary.register(pandas.DataFrame.__ror__, infer_dtypes="bool") - __rxor__ = Binary.register(pandas.DataFrame.__rxor__, infer_dtypes="bool") - __xor__ = Binary.register(pandas.DataFrame.__xor__, infer_dtypes="bool") + __and__ = Binary.register(pandas.DataFrame.__and__, infer_dtypes=np.bool_) + __or__ = Binary.register(pandas.DataFrame.__or__, infer_dtypes=np.bool_) + __rand__ = Binary.register(pandas.DataFrame.__rand__, infer_dtypes=np.bool_) + __ror__ = Binary.register(pandas.DataFrame.__ror__, infer_dtypes=np.bool_) + __rxor__ = Binary.register(pandas.DataFrame.__rxor__, infer_dtypes=np.bool_) + __xor__ = Binary.register(pandas.DataFrame.__xor__, infer_dtypes=np.bool_) df_update = Binary.register( copy_df_for_func(pandas.DataFrame.update, display_name="update"), join_type="left", @@ -475,19 +475,19 @@ def to_numpy(self, **kwargs): lambda df, other, *args, **kwargs: pandas.DataFrame( np.logical_and(df, other, *args, **kwargs) ), - infer_dtypes="bool", + infer_dtypes=np.bool_, ) _logical_or = Binary.register( lambda df, other, *args, **kwargs: pandas.DataFrame( np.logical_or(df, other, *args, **kwargs) ), - infer_dtypes="bool", + infer_dtypes=np.bool_, ) _logical_xor = Binary.register( lambda df, other, *args, **kwargs: pandas.DataFrame( np.logical_xor(df, other, *args, **kwargs) ), - infer_dtypes="bool", + infer_dtypes=np.bool_, ) def where(self, cond, other, **kwargs): @@ -943,7 +943,7 @@ def compute_dtypes_fn(dtypes, axis, **kwargs): and any(is_bool_dtype(t) for t in dtypes) and any(is_numeric_dtype(t) for t in dtypes) ): - return "object" + return np.object_ # how to take into account backend here? return "float64" @@ -1850,7 +1850,7 @@ def isin_func(df, values): ) return res - return Map.register(isin_func, shape_hint=shape_hint, dtypes="bool")( + return Map.register(isin_func, shape_hint=shape_hint, dtypes=np.bool_)( self, values ) @@ -1882,37 +1882,37 @@ def convert_dtypes( return result invert = Map.register(pandas.DataFrame.__invert__, dtypes="copy") - isna = Map.register(pandas.DataFrame.isna, dtypes="bool") + isna = Map.register(pandas.DataFrame.isna, dtypes=np.bool_) # better way to distinguish methods for NumPy API? 
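# The numpy-API helpers registered below all share one shape: wrap a numpy
# ufunc so it takes and returns pandas objects with a known elementwise result
# dtype. A minimal standalone sketch of that wrapper, assuming plain
# numpy-backed data:
import numpy as np
import pandas

def wrapped_isfinite(df, *args, **kwargs):
    return pandas.DataFrame(np.isfinite(df, *args, **kwargs))

print(wrapped_isfinite(pandas.DataFrame({"a": [1.0, np.inf, np.nan]})))
#        a
# 0   True
# 1  False
# 2  False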
_isfinite = Map.register( lambda df, *args, **kwargs: pandas.DataFrame(np.isfinite(df, *args, **kwargs)), - dtypes="bool", + dtypes=np.bool_, ) _isinf = Map.register( # Needed for numpy API lambda df, *args, **kwargs: pandas.DataFrame(np.isinf(df, *args, **kwargs)), - dtypes="bool", + dtypes=np.bool_, ) _isnat = Map.register( # Needed for numpy API lambda df, *args, **kwargs: pandas.DataFrame(np.isnat(df, *args, **kwargs)), - dtypes="bool", + dtypes=np.bool_, ) _isneginf = Map.register( # Needed for numpy API lambda df, *args, **kwargs: pandas.DataFrame(np.isneginf(df, *args, **kwargs)), - dtypes="bool", + dtypes=np.bool_, ) _isposinf = Map.register( # Needed for numpy API lambda df, *args, **kwargs: pandas.DataFrame(np.isposinf(df, *args, **kwargs)), - dtypes="bool", + dtypes=np.bool_, ) _iscomplex = Map.register( # Needed for numpy API lambda df, *args, **kwargs: pandas.DataFrame(np.iscomplex(df, *args, **kwargs)), - dtypes="bool", + dtypes=np.bool_, ) _isreal = Map.register( # Needed for numpy API lambda df, *args, **kwargs: pandas.DataFrame(np.isreal(df, *args, **kwargs)), - dtypes="bool", + dtypes=np.bool_, ) - _logical_not = Map.register(np.logical_not, dtypes="bool") # Needed for numpy API + _logical_not = Map.register(np.logical_not, dtypes=np.bool_) # Needed for numpy API _tanh = Map.register( lambda df, *args, **kwargs: pandas.DataFrame(np.tanh(df, *args, **kwargs)) ) # Needed for numpy API @@ -1923,7 +1923,7 @@ def convert_dtypes( lambda df, *args, **kwargs: pandas.DataFrame(np.exp(df, *args, **kwargs)) ) # Needed for numpy API negative = Map.register(pandas.DataFrame.__neg__) - notna = Map.register(pandas.DataFrame.notna, dtypes="bool") + notna = Map.register(pandas.DataFrame.notna, dtypes=np.bool_) round = Map.register(pandas.DataFrame.round) replace = Map.register(pandas.DataFrame.replace) series_view = Map.register( @@ -1949,24 +1949,24 @@ def convert_dtypes( str_capitalize = Map.register(_str_map("capitalize"), dtypes="copy") str_center = Map.register(_str_map("center"), dtypes="copy") - str_contains = Map.register(_str_map("contains"), dtypes="bool") - str_count = Map.register(_str_map("count"), dtypes="int64") - str_endswith = Map.register(_str_map("endswith"), dtypes="bool") - str_find = Map.register(_str_map("find"), dtypes="int64") + str_contains = Map.register(_str_map("contains"), dtypes=np.bool_) + str_count = Map.register(_str_map("count"), dtypes=np.int64) + str_endswith = Map.register(_str_map("endswith"), dtypes=np.bool_) + str_find = Map.register(_str_map("find"), dtypes=np.int64) str_findall = Map.register(_str_map("findall"), dtypes="copy") str_get = Map.register(_str_map("get"), dtypes="copy") - str_index = Map.register(_str_map("index"), dtypes="int64") - str_isalnum = Map.register(_str_map("isalnum"), dtypes="bool") - str_isalpha = Map.register(_str_map("isalpha"), dtypes="bool") - str_isdecimal = Map.register(_str_map("isdecimal"), dtypes="bool") - str_isdigit = Map.register(_str_map("isdigit"), dtypes="bool") - str_islower = Map.register(_str_map("islower"), dtypes="bool") - str_isnumeric = Map.register(_str_map("isnumeric"), dtypes="bool") - str_isspace = Map.register(_str_map("isspace"), dtypes="bool") - str_istitle = Map.register(_str_map("istitle"), dtypes="bool") - str_isupper = Map.register(_str_map("isupper"), dtypes="bool") + str_index = Map.register(_str_map("index"), dtypes=np.int64) + str_isalnum = Map.register(_str_map("isalnum"), dtypes=np.bool_) + str_isalpha = Map.register(_str_map("isalpha"), dtypes=np.bool_) + str_isdecimal = 
Map.register(_str_map("isdecimal"), dtypes=np.bool_) + str_isdigit = Map.register(_str_map("isdigit"), dtypes=np.bool_) + str_islower = Map.register(_str_map("islower"), dtypes=np.bool_) + str_isnumeric = Map.register(_str_map("isnumeric"), dtypes=np.bool_) + str_isspace = Map.register(_str_map("isspace"), dtypes=np.bool_) + str_istitle = Map.register(_str_map("istitle"), dtypes=np.bool_) + str_isupper = Map.register(_str_map("isupper"), dtypes=np.bool_) str_join = Map.register(_str_map("join"), dtypes="copy") - str_len = Map.register(_str_map("len"), dtypes="int64") + str_len = Map.register(_str_map("len"), dtypes=np.int64) str_ljust = Map.register(_str_map("ljust"), dtypes="copy") str_lower = Map.register(_str_map("lower"), dtypes="copy") str_lstrip = Map.register(_str_map("lstrip"), dtypes="copy") @@ -1995,8 +1995,8 @@ def str_extract(self, pat, flags, expand): return qc str_replace = Map.register(_str_map("replace"), dtypes="copy", shape_hint="column") - str_rfind = Map.register(_str_map("rfind"), dtypes="int64", shape_hint="column") - str_rindex = Map.register(_str_map("rindex"), dtypes="int64", shape_hint="column") + str_rfind = Map.register(_str_map("rfind"), dtypes=np.int64, shape_hint="column") + str_rindex = Map.register(_str_map("rindex"), dtypes=np.int64, shape_hint="column") str_rjust = Map.register(_str_map("rjust"), dtypes="copy", shape_hint="column") _str_rpartition = Map.register( _str_map("rpartition"), dtypes="copy", shape_hint="column" @@ -2030,7 +2030,7 @@ def str_split(self, pat=None, n=-1, expand=False, regex=None): return self._str_split(pat=pat, n=n, expand=False, regex=regex) str_startswith = Map.register( - _str_map("startswith"), dtypes="bool", shape_hint="column" + _str_map("startswith"), dtypes=np.bool_, shape_hint="column" ) str_strip = Map.register(_str_map("strip"), dtypes="copy", shape_hint="column") str_swapcase = Map.register( @@ -2102,49 +2102,51 @@ def searchsorted(df): # Dt map partitions operations - dt_date = Map.register(_dt_prop_map("date"), dtypes="object") - dt_time = Map.register(_dt_prop_map("time"), dtypes="object") - dt_timetz = Map.register(_dt_prop_map("timetz"), dtypes="object") + dt_date = Map.register(_dt_prop_map("date"), dtypes=np.object_) + dt_time = Map.register(_dt_prop_map("time"), dtypes=np.object_) + dt_timetz = Map.register(_dt_prop_map("timetz"), dtypes=np.object_) dt_year = Map.register(_dt_prop_map("year"), dtypes="int32") dt_month = Map.register(_dt_prop_map("month"), dtypes="int32") dt_day = Map.register(_dt_prop_map("day"), dtypes="int32") - dt_hour = Map.register(_dt_prop_map("hour"), dtypes="int64") - dt_minute = Map.register(_dt_prop_map("minute"), dtypes="int64") - dt_second = Map.register(_dt_prop_map("second"), dtypes="int64") - dt_microsecond = Map.register(_dt_prop_map("microsecond"), dtypes="int64") - dt_nanosecond = Map.register(_dt_prop_map("nanosecond"), dtypes="int64") - dt_dayofweek = Map.register(_dt_prop_map("dayofweek"), dtypes="int64") - dt_weekday = Map.register(_dt_prop_map("weekday"), dtypes="int64") - dt_dayofyear = Map.register(_dt_prop_map("dayofyear"), dtypes="int64") - dt_quarter = Map.register(_dt_prop_map("quarter"), dtypes="int64") - dt_is_month_start = Map.register(_dt_prop_map("is_month_start"), dtypes="bool") - dt_is_month_end = Map.register(_dt_prop_map("is_month_end"), dtypes="bool") - dt_is_quarter_start = Map.register(_dt_prop_map("is_quarter_start"), dtypes="bool") - dt_is_quarter_end = Map.register(_dt_prop_map("is_quarter_end"), dtypes="bool") - dt_is_year_start = 
Map.register(_dt_prop_map("is_year_start"), dtypes="bool") - dt_is_year_end = Map.register(_dt_prop_map("is_year_end"), dtypes="bool") - dt_is_leap_year = Map.register(_dt_prop_map("is_leap_year"), dtypes="bool") - dt_daysinmonth = Map.register(_dt_prop_map("daysinmonth"), dtypes="int64") - dt_days_in_month = Map.register(_dt_prop_map("days_in_month"), dtypes="int64") + dt_hour = Map.register(_dt_prop_map("hour"), dtypes=np.int64) + dt_minute = Map.register(_dt_prop_map("minute"), dtypes=np.int64) + dt_second = Map.register(_dt_prop_map("second"), dtypes=np.int64) + dt_microsecond = Map.register(_dt_prop_map("microsecond"), dtypes=np.int64) + dt_nanosecond = Map.register(_dt_prop_map("nanosecond"), dtypes=np.int64) + dt_dayofweek = Map.register(_dt_prop_map("dayofweek"), dtypes=np.int64) + dt_weekday = Map.register(_dt_prop_map("weekday"), dtypes=np.int64) + dt_dayofyear = Map.register(_dt_prop_map("dayofyear"), dtypes=np.int64) + dt_quarter = Map.register(_dt_prop_map("quarter"), dtypes=np.int64) + dt_is_month_start = Map.register(_dt_prop_map("is_month_start"), dtypes=np.bool_) + dt_is_month_end = Map.register(_dt_prop_map("is_month_end"), dtypes=np.bool_) + dt_is_quarter_start = Map.register( + _dt_prop_map("is_quarter_start"), dtypes=np.bool_ + ) + dt_is_quarter_end = Map.register(_dt_prop_map("is_quarter_end"), dtypes=np.bool_) + dt_is_year_start = Map.register(_dt_prop_map("is_year_start"), dtypes=np.bool_) + dt_is_year_end = Map.register(_dt_prop_map("is_year_end"), dtypes=np.bool_) + dt_is_leap_year = Map.register(_dt_prop_map("is_leap_year"), dtypes=np.bool_) + dt_daysinmonth = Map.register(_dt_prop_map("daysinmonth"), dtypes=np.int64) + dt_days_in_month = Map.register(_dt_prop_map("days_in_month"), dtypes=np.int64) dt_asfreq = Map.register(_dt_func_map("asfreq")) dt_to_period = Map.register(_dt_func_map("to_period")) - dt_to_pydatetime = Map.register(_dt_func_map("to_pydatetime"), dtypes="object") + dt_to_pydatetime = Map.register(_dt_func_map("to_pydatetime"), dtypes=np.object_) dt_tz_localize = Map.register(_dt_func_map("tz_localize")) dt_tz_convert = Map.register(_dt_func_map("tz_convert")) dt_normalize = Map.register(_dt_func_map("normalize")) - dt_strftime = Map.register(_dt_func_map("strftime"), dtypes="object") + dt_strftime = Map.register(_dt_func_map("strftime"), dtypes=np.object_) dt_round = Map.register(_dt_func_map("round")) dt_floor = Map.register(_dt_func_map("floor")) dt_ceil = Map.register(_dt_func_map("ceil")) - dt_month_name = Map.register(_dt_func_map("month_name"), dtypes="object") - dt_day_name = Map.register(_dt_func_map("day_name"), dtypes="object") - dt_to_pytimedelta = Map.register(_dt_func_map("to_pytimedelta"), dtypes="object") + dt_month_name = Map.register(_dt_func_map("month_name"), dtypes=np.object_) + dt_day_name = Map.register(_dt_func_map("day_name"), dtypes=np.object_) + dt_to_pytimedelta = Map.register(_dt_func_map("to_pytimedelta"), dtypes=np.object_) dt_total_seconds = Map.register(_dt_func_map("total_seconds"), dtypes="float64") - dt_seconds = Map.register(_dt_prop_map("seconds"), dtypes="int64") - dt_days = Map.register(_dt_prop_map("days"), dtypes="int64") - dt_microseconds = Map.register(_dt_prop_map("microseconds"), dtypes="int64") - dt_nanoseconds = Map.register(_dt_prop_map("nanoseconds"), dtypes="int64") - dt_qyear = Map.register(_dt_prop_map("qyear"), dtypes="int64") + dt_seconds = Map.register(_dt_prop_map("seconds"), dtypes=np.int64) + dt_days = Map.register(_dt_prop_map("days"), dtypes=np.int64) + dt_microseconds = 
Map.register(_dt_prop_map("microseconds"), dtypes=np.int64) + dt_nanoseconds = Map.register(_dt_prop_map("nanoseconds"), dtypes=np.int64) + dt_qyear = Map.register(_dt_prop_map("qyear"), dtypes=np.int64) dt_start_time = Map.register(_dt_prop_map("start_time")) dt_end_time = Map.register(_dt_prop_map("end_time")) dt_to_timestamp = Map.register(_dt_func_map("to_timestamp")) @@ -3190,7 +3192,7 @@ def _compute_duplicated(df): # pragma: no cover axis=1, function=_compute_hash, # TODO: pyarrow backend - dtypes="object", + dtypes=np.object_, ) else: hashed_modin_frame = self._modin_frame @@ -3199,7 +3201,7 @@ def _compute_duplicated(df): # pragma: no cover func=_compute_duplicated, new_index=self._modin_frame.copy_index_cache(), new_columns=[MODIN_UNNAMED_SERIES_LABEL], - dtypes="bool", + dtypes=np.bool_, keep_partitioning=True, ) return self.__constructor__(new_modin_frame, shape_hint="column") @@ -3626,7 +3628,7 @@ def groupby_mean(self, by, axis, groupby_kwargs, agg_args, agg_kwargs, drop=Fals ) qc_with_converted_datetime_cols = ( - self.astype({col: "int64" for col in datetime_cols.keys()}) + self.astype({col: np.int64 for col in datetime_cols.keys()}) if len(datetime_cols) > 0 else self ) @@ -4480,13 +4482,13 @@ def map_fn(df): # pragma: no cover if len(columns) == len(self.columns): # TODO: pyarrow backend new_modin_frame = self._modin_frame.apply_full_axis( - 0, map_fn, new_index=self.index, dtypes="bool" + 0, map_fn, new_index=self.index, dtypes=np.bool_ ) untouched_frame = None else: new_modin_frame = self._modin_frame.take_2d_labels_or_positional( col_labels=columns - ).apply_full_axis(0, map_fn, new_index=self.index, dtypes="bool") + ).apply_full_axis(0, map_fn, new_index=self.index, dtypes=np.bool_) untouched_frame = self.drop(columns=columns) # If we mapped over all the data we are done. If not, we need to # prepend the `new_modin_frame` with the raw data from the columns that were From a04b0a2b958f57b0548b4fa5aa9bd84ad8b1b797 Mon Sep 17 00:00:00 2001 From: Anatoly Myachev Date: Mon, 13 May 2024 02:29:51 +0200 Subject: [PATCH 34/50] cleanup Signed-off-by: Anatoly Myachev --- modin/core/storage_formats/base/query_compiler.py | 8 +------- modin/core/storage_formats/pandas/query_compiler.py | 10 +++++----- modin/pandas/dataframe.py | 4 ++-- modin/tests/pandas/utils.py | 10 +--------- 4 files changed, 9 insertions(+), 23 deletions(-) diff --git a/modin/core/storage_formats/base/query_compiler.py b/modin/core/storage_formats/base/query_compiler.py index 1d6cc719d17..6258d43503a 100644 --- a/modin/core/storage_formats/base/query_compiler.py +++ b/modin/core/storage_formats/base/query_compiler.py @@ -22,7 +22,7 @@ import abc import warnings from functools import cached_property -from typing import TYPE_CHECKING, Hashable, List, Optional +from typing import Hashable, List, Optional import numpy as np import pandas @@ -53,10 +53,6 @@ from . import doc_utils -if TYPE_CHECKING: - # TODO: should be ModinDataframe - from modin.core.dataframe.pandas.dataframe.dataframe import PandasDataframe - def _get_axis(axis): """ @@ -131,8 +127,6 @@ class BaseQueryCompiler( for a list of requirements for subclassing this object. """ - _modin_frame: PandasDataframe - def __wrap_in_qc(self, obj): """ Wrap `obj` in query compiler. 
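# For orientation between the hunks above and below: the backend reported by
# `get_backend()`/`get_pandas_backend()` is recognized purely from the dtypes.
# A minimal sketch of that detection, assuming the optional pyarrow package is
# installed:
import pandas

df = pandas.DataFrame({"a": [1, 2, None], "b": ["x", "y", None]})
df_pa = df.convert_dtypes(dtype_backend="pyarrow")
print(df_pa.dtypes)  # e.g. int64[pyarrow], string[pyarrow]
print(any(isinstance(t, pandas.ArrowDtype) for t in df_pa.dtypes))  # True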
diff --git a/modin/core/storage_formats/pandas/query_compiler.py b/modin/core/storage_formats/pandas/query_compiler.py index 4ad25f5ace5..9dfd15a69bf 100644 --- a/modin/core/storage_formats/pandas/query_compiler.py +++ b/modin/core/storage_formats/pandas/query_compiler.py @@ -945,7 +945,7 @@ def compute_dtypes_fn(dtypes, axis, **kwargs): ): return np.object_ # how to take into account backend here? - return "float64" + return np.float64 return TreeReduce.register( map_fn, @@ -2141,7 +2141,7 @@ def searchsorted(df): dt_month_name = Map.register(_dt_func_map("month_name"), dtypes=np.object_) dt_day_name = Map.register(_dt_func_map("day_name"), dtypes=np.object_) dt_to_pytimedelta = Map.register(_dt_func_map("to_pytimedelta"), dtypes=np.object_) - dt_total_seconds = Map.register(_dt_func_map("total_seconds"), dtypes="float64") + dt_total_seconds = Map.register(_dt_func_map("total_seconds"), dtypes=np.float64) dt_seconds = Map.register(_dt_prop_map("seconds"), dtypes=np.int64) dt_days = Map.register(_dt_prop_map("days"), dtypes=np.int64) dt_microseconds = Map.register(_dt_prop_map("microseconds"), dtypes=np.int64) @@ -2323,7 +2323,7 @@ def map_func(df): # pragma: no cover # Does it work with pyarrow backend? df_mask = np.isfinite(df) - result = np.empty((n_rows, n_cols), dtype="float64") + result = np.empty((n_rows, n_cols), dtype=np.float64) for i in range(n_rows): df_ith_row = df[i] @@ -2679,7 +2679,7 @@ def quantile_builder(df, **kwargs): lambda df: quantile_builder(df, **kwargs), new_index=q_index, new_columns=new_columns, - dtypes="float64", + dtypes=np.float64, ) result = self.__constructor__(new_modin_frame) return result.transpose() if axis == 1 else result @@ -2696,7 +2696,7 @@ def rank(self, **kwargs): if not numeric_only else None ), - dtypes="float64", + dtypes=np.float64, sync_labels=False, ) return self.__constructor__(new_modin_frame) diff --git a/modin/pandas/dataframe.py b/modin/pandas/dataframe.py index 53c68494249..0d00f614454 100644 --- a/modin/pandas/dataframe.py +++ b/modin/pandas/dataframe.py @@ -1623,7 +1623,7 @@ def prod( and numeric_only is False and min_count > len(axis_to_apply) # Type inference is not so simple for pyarrow - and self._query_compiler.get_backend() == "default" + and self._query_compiler.get_backend() is not None ): new_index = self.columns if not axis else self.index # >>> pd.DataFrame([1,2,3,4], dtype="int64[pyarrow]").prod(min_count=10) @@ -2153,7 +2153,7 @@ def sum( and numeric_only is False and min_count > len(axis_to_apply) # Type inference is not so simple for pyarrow - and self._query_compiler.get_backend() == "default" + and self._query_compiler.get_backend() is not None ): new_index = self.columns if not axis else self.index return Series( diff --git a/modin/tests/pandas/utils.py b/modin/tests/pandas/utils.py index b04026393a9..1b085cb6614 100644 --- a/modin/tests/pandas/utils.py +++ b/modin/tests/pandas/utils.py @@ -1088,11 +1088,7 @@ def eval_io_from_str(csv_str: str, unique_filename: str, **kwargs): def create_test_dfs(*args, **kwargs) -> tuple[pd.DataFrame, pandas.DataFrame]: - post_fn = kwargs.pop("post_fn", None) - - if post_fn is None: - # TODO: REVERT ME - post_fn = lambda df: df.convert_dtypes(dtype_backend="pyarrow") # noqa: E731 + post_fn = kwargs.pop("post_fn", lambda df: df) return tuple( map(post_fn, [pd.DataFrame(*args, **kwargs), pandas.DataFrame(*args, **kwargs)]) ) @@ -1108,10 +1104,6 @@ def create_test_series(vals, sort=False, **kwargs) -> tuple[pd.Series, pandas.Se if sort: modin_series = 
modin_series.sort_values().reset_index(drop=True) pandas_series = pandas_series.sort_values().reset_index(drop=True) - - # TODO: REVERT ME - modin_series = modin_series.convert_dtypes(dtype_backend="pyarrow") - pandas_series = pandas_series.convert_dtypes(dtype_backend="pyarrow") return modin_series, pandas_series From 60101b511522382771deca8d3452f4b93794e0f0 Mon Sep 17 00:00:00 2001 From: Anatoly Myachev Date: Mon, 13 May 2024 02:39:43 +0200 Subject: [PATCH 35/50] fix Signed-off-by: Anatoly Myachev --- .../storage_formats/pandas/query_compiler.py | 32 +++++++++---------- 1 file changed, 16 insertions(+), 16 deletions(-) diff --git a/modin/core/storage_formats/pandas/query_compiler.py b/modin/core/storage_formats/pandas/query_compiler.py index 9dfd15a69bf..afe1688a831 100644 --- a/modin/core/storage_formats/pandas/query_compiler.py +++ b/modin/core/storage_formats/pandas/query_compiler.py @@ -427,22 +427,22 @@ def to_numpy(self, **kwargs): combine_first = Binary.register( pandas.DataFrame.combine_first, infer_dtypes="common_cast" ) - eq = Binary.register(pandas.DataFrame.eq, infer_dtypes=np.bool_) + eq = Binary.register(pandas.DataFrame.eq, infer_dtypes="bool") equals = Binary.register( lambda df, other: pandas.DataFrame([[df.equals(other)]]), join_type=None, labels="drop", - infer_dtypes=np.bool_, + infer_dtypes="bool", ) floordiv = Binary.register(pandas.DataFrame.floordiv, infer_dtypes="try_sample") - ge = Binary.register(pandas.DataFrame.ge, infer_dtypes=np.bool_) - gt = Binary.register(pandas.DataFrame.gt, infer_dtypes=np.bool_) - le = Binary.register(pandas.DataFrame.le, infer_dtypes=np.bool_) - lt = Binary.register(pandas.DataFrame.lt, infer_dtypes=np.bool_) + ge = Binary.register(pandas.DataFrame.ge, infer_dtypes="bool") + gt = Binary.register(pandas.DataFrame.gt, infer_dtypes="bool") + le = Binary.register(pandas.DataFrame.le, infer_dtypes="bool") + lt = Binary.register(pandas.DataFrame.lt, infer_dtypes="bool") mod = Binary.register(pandas.DataFrame.mod, infer_dtypes="try_sample") mul = Binary.register(pandas.DataFrame.mul, infer_dtypes="try_sample") rmul = Binary.register(pandas.DataFrame.rmul, infer_dtypes="try_sample") - ne = Binary.register(pandas.DataFrame.ne, infer_dtypes=np.bool_) + ne = Binary.register(pandas.DataFrame.ne, infer_dtypes="bool") pow = Binary.register(pandas.DataFrame.pow, infer_dtypes="try_sample") radd = Binary.register(pandas.DataFrame.radd, infer_dtypes="try_sample") rfloordiv = Binary.register(pandas.DataFrame.rfloordiv, infer_dtypes="try_sample") @@ -452,12 +452,12 @@ def to_numpy(self, **kwargs): rtruediv = Binary.register(pandas.DataFrame.rtruediv, infer_dtypes="try_sample") sub = Binary.register(pandas.DataFrame.sub, infer_dtypes="try_sample") truediv = Binary.register(pandas.DataFrame.truediv, infer_dtypes="try_sample") - __and__ = Binary.register(pandas.DataFrame.__and__, infer_dtypes=np.bool_) - __or__ = Binary.register(pandas.DataFrame.__or__, infer_dtypes=np.bool_) - __rand__ = Binary.register(pandas.DataFrame.__rand__, infer_dtypes=np.bool_) - __ror__ = Binary.register(pandas.DataFrame.__ror__, infer_dtypes=np.bool_) - __rxor__ = Binary.register(pandas.DataFrame.__rxor__, infer_dtypes=np.bool_) - __xor__ = Binary.register(pandas.DataFrame.__xor__, infer_dtypes=np.bool_) + __and__ = Binary.register(pandas.DataFrame.__and__, infer_dtypes="bool") + __or__ = Binary.register(pandas.DataFrame.__or__, infer_dtypes="bool") + __rand__ = Binary.register(pandas.DataFrame.__rand__, infer_dtypes="bool") + __ror__ = Binary.register(pandas.DataFrame.__ror__, 
infer_dtypes="bool") + __rxor__ = Binary.register(pandas.DataFrame.__rxor__, infer_dtypes="bool") + __xor__ = Binary.register(pandas.DataFrame.__xor__, infer_dtypes="bool") df_update = Binary.register( copy_df_for_func(pandas.DataFrame.update, display_name="update"), join_type="left", @@ -475,19 +475,19 @@ def to_numpy(self, **kwargs): lambda df, other, *args, **kwargs: pandas.DataFrame( np.logical_and(df, other, *args, **kwargs) ), - infer_dtypes=np.bool_, + infer_dtypes="bool", ) _logical_or = Binary.register( lambda df, other, *args, **kwargs: pandas.DataFrame( np.logical_or(df, other, *args, **kwargs) ), - infer_dtypes=np.bool_, + infer_dtypes="bool", ) _logical_xor = Binary.register( lambda df, other, *args, **kwargs: pandas.DataFrame( np.logical_xor(df, other, *args, **kwargs) ), - infer_dtypes=np.bool_, + infer_dtypes="bool", ) def where(self, cond, other, **kwargs): From c25a41907e10e589901fce503287dbfd63458e6d Mon Sep 17 00:00:00 2001 From: Anatoly Myachev Date: Mon, 13 May 2024 12:20:09 +0200 Subject: [PATCH 36/50] cleanup Signed-off-by: Anatoly Myachev --- .../storage_formats/pandas/query_compiler.py | 22 ++++++++----------- modin/tests/pandas/dataframe/test_binary.py | 2 -- modin/tests/pandas/dataframe/test_default.py | 4 +--- 3 files changed, 10 insertions(+), 18 deletions(-) diff --git a/modin/core/storage_formats/pandas/query_compiler.py b/modin/core/storage_formats/pandas/query_compiler.py index a7332e5e590..228a8b8ab38 100644 --- a/modin/core/storage_formats/pandas/query_compiler.py +++ b/modin/core/storage_formats/pandas/query_compiler.py @@ -1950,7 +1950,7 @@ def convert_dtypes( str_capitalize = Map.register(_str_map("capitalize"), dtypes="copy") str_center = Map.register(_str_map("center"), dtypes="copy") str_contains = Map.register(_str_map("contains"), dtypes=np.bool_) - str_count = Map.register(_str_map("count"), dtypes=np.int64) + str_count = Map.register(_str_map("count"), dtypes=int) str_endswith = Map.register(_str_map("endswith"), dtypes=np.bool_) str_find = Map.register(_str_map("find"), dtypes=np.int64) str_findall = Map.register(_str_map("findall"), dtypes="copy") @@ -1966,7 +1966,7 @@ def convert_dtypes( str_istitle = Map.register(_str_map("istitle"), dtypes=np.bool_) str_isupper = Map.register(_str_map("isupper"), dtypes=np.bool_) str_join = Map.register(_str_map("join"), dtypes="copy") - str_len = Map.register(_str_map("len"), dtypes=np.int64) + str_len = Map.register(_str_map("len"), dtypes=int) str_ljust = Map.register(_str_map("ljust"), dtypes="copy") str_lower = Map.register(_str_map("lower"), dtypes="copy") str_lstrip = Map.register(_str_map("lstrip"), dtypes="copy") @@ -2105,9 +2105,9 @@ def searchsorted(df): dt_date = Map.register(_dt_prop_map("date"), dtypes=np.object_) dt_time = Map.register(_dt_prop_map("time"), dtypes=np.object_) dt_timetz = Map.register(_dt_prop_map("timetz"), dtypes=np.object_) - dt_year = Map.register(_dt_prop_map("year"), dtypes="int32") - dt_month = Map.register(_dt_prop_map("month"), dtypes="int32") - dt_day = Map.register(_dt_prop_map("day"), dtypes="int32") + dt_year = Map.register(_dt_prop_map("year"), dtypes=np.int32) + dt_month = Map.register(_dt_prop_map("month"), dtypes=np.int32) + dt_day = Map.register(_dt_prop_map("day"), dtypes=np.int32) dt_hour = Map.register(_dt_prop_map("hour"), dtypes=np.int64) dt_minute = Map.register(_dt_prop_map("minute"), dtypes=np.int64) dt_second = Map.register(_dt_prop_map("second"), dtypes=np.int64) @@ -2158,7 +2158,6 @@ def astype(self, col_dtypes, errors: str = "raise"): # other 
query compilers may not take care of error handling at the API # layer. This query compiler assumes there won't be any errors due to # invalid type keys. - # Function that can change the backend return self.__constructor__( self._modin_frame.astype(col_dtypes, errors=errors), shape_hint=self._shape_hint, @@ -2320,7 +2319,6 @@ def map_func(df): # pragma: no cover """Compute covariance or correlation matrix for the passed frame.""" df = df.to_numpy() n_rows = df.shape[0] - # Does it work with pyarrow backend? df_mask = np.isfinite(df) result = np.empty((n_rows, n_cols), dtype=np.float64) @@ -3191,8 +3189,7 @@ def _compute_duplicated(df): # pragma: no cover hashed_modin_frame = self._modin_frame.reduce( axis=1, function=_compute_hash, - # TODO: pyarrow backend - dtypes=np.object_, + dtypes=pandas.api.types.pandas_dtype("O"), ) else: hashed_modin_frame = self._modin_frame @@ -3628,7 +3625,7 @@ def groupby_mean(self, by, axis, groupby_kwargs, agg_args, agg_kwargs, drop=Fals ) qc_with_converted_datetime_cols = ( - self.astype({col: np.int64 for col in datetime_cols.keys()}) + self.astype({col: "int64" for col in datetime_cols.keys()}) if len(datetime_cols) > 0 else self ) @@ -4480,15 +4477,14 @@ def map_fn(df): # pragma: no cover # efficient if we are mapping over all of the data to do it this way # than it would be to reuse the code for specific columns. if len(columns) == len(self.columns): - # TODO: pyarrow backend new_modin_frame = self._modin_frame.apply_full_axis( - 0, map_fn, new_index=self.index, dtypes=np.bool_ + 0, map_fn, new_index=self.index, dtypes=bool ) untouched_frame = None else: new_modin_frame = self._modin_frame.take_2d_labels_or_positional( col_labels=columns - ).apply_full_axis(0, map_fn, new_index=self.index, dtypes=np.bool_) + ).apply_full_axis(0, map_fn, new_index=self.index, dtypes=bool) untouched_frame = self.drop(columns=columns) # If we mapped over all the data we are done. 
If not, we need to # prepend the `new_modin_frame` with the raw data from the columns that were diff --git a/modin/tests/pandas/dataframe/test_binary.py b/modin/tests/pandas/dataframe/test_binary.py index a1070d892b7..0a8aa80d6d3 100644 --- a/modin/tests/pandas/dataframe/test_binary.py +++ b/modin/tests/pandas/dataframe/test_binary.py @@ -472,8 +472,6 @@ def test_non_commutative_multiply(): eval_general(modin_df, pandas_df, lambda s: s * integer) -# TODO: just for developing purpose; remove `skip` mark -@pytest.mark.skip @pytest.mark.parametrize( "op", [ diff --git a/modin/tests/pandas/dataframe/test_default.py b/modin/tests/pandas/dataframe/test_default.py index 173e90e8762..76af5a110c9 100644 --- a/modin/tests/pandas/dataframe/test_default.py +++ b/modin/tests/pandas/dataframe/test_default.py @@ -528,8 +528,6 @@ def test_info(data, verbose, max_cols, memory_usage, show_counts): assert modin_info[1:] == pandas_info[1:] -# TODO: just for developing purpose; remove `xfail` mark -@pytest.mark.xfail @pytest.mark.parametrize("axis", axis_values, ids=axis_keys) @pytest.mark.parametrize("skipna", [False, True]) @pytest.mark.parametrize("numeric_only", [False, True]) @@ -718,7 +716,7 @@ def test_pivot_table_data(data, index, columns, values, aggfunc, request): "callable_tree_reduce_func" in request.node.callspec.id and "int_data" in request.node.callspec.id ): - expected_exception = TypeError("'float' object is not callable") + expected_exception = TypeError("'numpy.float64' object is not callable") eval_general( md_df, From 6e0c37e4b97c2a3d043f9b192392125ebfd9047b Mon Sep 17 00:00:00 2001 From: Anatoly Myachev Date: Mon, 13 May 2024 13:17:56 +0200 Subject: [PATCH 37/50] fixes Signed-off-by: Anatoly Myachev --- .../storage_formats/pandas/query_compiler.py | 5 ++--- modin/tests/pandas/utils.py | 21 ++++++++++++++++--- 2 files changed, 20 insertions(+), 6 deletions(-) diff --git a/modin/core/storage_formats/pandas/query_compiler.py b/modin/core/storage_formats/pandas/query_compiler.py index 228a8b8ab38..358ae635d47 100644 --- a/modin/core/storage_formats/pandas/query_compiler.py +++ b/modin/core/storage_formats/pandas/query_compiler.py @@ -943,9 +943,8 @@ def compute_dtypes_fn(dtypes, axis, **kwargs): and any(is_bool_dtype(t) for t in dtypes) and any(is_numeric_dtype(t) for t in dtypes) ): - return np.object_ - # how to take into account backend here? 
- return np.float64 + return "object" + return "float64" return TreeReduce.register( map_fn, diff --git a/modin/tests/pandas/utils.py b/modin/tests/pandas/utils.py index 227c438ebba..1949097ce4f 100644 --- a/modin/tests/pandas/utils.py +++ b/modin/tests/pandas/utils.py @@ -1087,14 +1087,25 @@ def eval_io_from_str(csv_str: str, unique_filename: str, **kwargs): ) -def create_test_dfs(*args, **kwargs) -> tuple[pd.DataFrame, pandas.DataFrame]: - post_fn = kwargs.pop("post_fn", lambda df: df) +def create_test_dfs( + *args, post_fn=None, backend=None, **kwargs +) -> tuple[pd.DataFrame, pandas.DataFrame]: + if post_fn is None: + post_fn = lambda df: ( # noqa: E731 + df.convert_dtypes(dtype_backend=backend) if backend is not None else df + ) + elif backend is not None: + post_fn = lambda df: post_fn(df).convert_dtypes( # noqa: E731 + dtype_backend=backend + ) return tuple( map(post_fn, [pd.DataFrame(*args, **kwargs), pandas.DataFrame(*args, **kwargs)]) ) -def create_test_series(vals, sort=False, **kwargs) -> tuple[pd.Series, pandas.Series]: +def create_test_series( + vals, sort=False, backend=None, **kwargs +) -> tuple[pd.Series, pandas.Series]: if isinstance(vals, dict): modin_series = pd.Series(vals[next(iter(vals.keys()))], **kwargs) pandas_series = pandas.Series(vals[next(iter(vals.keys()))], **kwargs) @@ -1104,6 +1115,10 @@ def create_test_series(vals, sort=False, **kwargs) -> tuple[pd.Series, pandas.Se if sort: modin_series = modin_series.sort_values().reset_index(drop=True) pandas_series = pandas_series.sort_values().reset_index(drop=True) + + if backend is not None: + modin_series = modin_series.convert_dtypes(dtype_backend=backend) + pandas_series = pandas_series.convert_dtypes(dtype_backend=backend) return modin_series, pandas_series From 778be0204bd323ada8d9c365ec708e1b421901a1 Mon Sep 17 00:00:00 2001 From: Anatoly Myachev Date: Mon, 13 May 2024 15:22:46 +0200 Subject: [PATCH 38/50] cleanup Signed-off-by: Anatoly Myachev --- modin/core/dataframe/base/dataframe/utils.py | 6 +++++- modin/tests/pandas/dataframe/test_default.py | 4 +++- modin/tests/pandas/dataframe/test_map_metadata.py | 10 +++------- modin/tests/pandas/dataframe/test_reduce.py | 3 +-- 4 files changed, 12 insertions(+), 11 deletions(-) diff --git a/modin/core/dataframe/base/dataframe/utils.py b/modin/core/dataframe/base/dataframe/utils.py index c8e3f742193..66657ae76cb 100644 --- a/modin/core/dataframe/base/dataframe/utils.py +++ b/modin/core/dataframe/base/dataframe/utils.py @@ -24,6 +24,7 @@ import pandas from pandas._typing import IndexLabel from pandas.api.types import is_scalar +from pandas.core.dtypes.common import is_float_dtype, is_numeric_dtype class Axis(Enum): # noqa: PR01 @@ -169,7 +170,10 @@ def is_trivial_index(index: pandas.Index) -> bool: return True if isinstance(index, pandas.RangeIndex): return index.start == 0 and index.step == 1 - if not (isinstance(index, pandas.Index) and index.dtype == "int64"): + if not ( + isinstance(index, pandas.Index) + and (is_numeric_dtype(index) and not is_float_dtype(index)) + ): return False return ( index.is_monotonic_increasing diff --git a/modin/tests/pandas/dataframe/test_default.py b/modin/tests/pandas/dataframe/test_default.py index 76af5a110c9..64fb650f10c 100644 --- a/modin/tests/pandas/dataframe/test_default.py +++ b/modin/tests/pandas/dataframe/test_default.py @@ -778,6 +778,7 @@ def test_pivot_table_data(data, index, columns, values, aggfunc, request): [pytest.param("Custom name", id="str_name")], ) @pytest.mark.parametrize("fill_value", [None, 0]) 
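# The `backend` dimension added just below reruns the test against both the
# default numpy-backed dtypes and pyarrow-backed dtypes; when a backend is
# given, `create_test_dfs` applies `convert_dtypes(dtype_backend=backend)` to
# both the modin and the pandas frame before the comparison.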
+@pytest.mark.parametrize("backend", [None, "pyarrow"]) def test_pivot_table_margins( data, index, @@ -786,13 +787,14 @@ def test_pivot_table_margins( aggfunc, margins_name, fill_value, + backend, request, ): expected_exception = None if "dict_func" in request.node.callspec.id: expected_exception = KeyError("Column(s) ['col28', 'col38'] do not exist") eval_general( - *create_test_dfs(data), + *create_test_dfs(data, backend=backend), operation=lambda df, *args, **kwargs: df.pivot_table(*args, **kwargs), index=index, columns=columns, diff --git a/modin/tests/pandas/dataframe/test_map_metadata.py b/modin/tests/pandas/dataframe/test_map_metadata.py index b4980118922..ab7a7fa4a31 100644 --- a/modin/tests/pandas/dataframe/test_map_metadata.py +++ b/modin/tests/pandas/dataframe/test_map_metadata.py @@ -1429,9 +1429,6 @@ def comparator(df1, df2): elif idx == 2: # FIXME: https://github.com/modin-project/modin/issues/7080 expected_exception = False - - if any("pyarrow" in str(dtype) for dtype in pandas_df.dtypes): - pytest.xfail(reason="ValueError(2)") eval_insert( modin_df, pandas_df, @@ -1686,13 +1683,12 @@ def test___neg__(request, data): @pytest.mark.parametrize("data", test_data_values, ids=test_data_keys) def test___invert__(data, request): expected_exception = None - md_df, pd_df = create_test_dfs(data) if "float_nan_data" in request.node.callspec.id: # FIXME: https://github.com/modin-project/modin/issues/7081 expected_exception = False - if any("pyarrow" in str(dtype) for dtype in pd_df.dtypes): - pytest.xfail(reason="pyarrow.lib.ArrowNotImplementedError") - eval_general(md_df, pd_df, lambda df: ~df, expected_exception=expected_exception) + eval_general( + *create_test_dfs(data), lambda df: ~df, expected_exception=expected_exception + ) def test___invert___bool(): diff --git a/modin/tests/pandas/dataframe/test_reduce.py b/modin/tests/pandas/dataframe/test_reduce.py index 2105c165183..74c8285ba04 100644 --- a/modin/tests/pandas/dataframe/test_reduce.py +++ b/modin/tests/pandas/dataframe/test_reduce.py @@ -324,10 +324,9 @@ def test_sum(data, axis, skipna, is_transposed, request): df_equals(modin_result, pandas_result) -@pytest.mark.parametrize("dtype", ["int64", "Int64"]) +@pytest.mark.parametrize("dtype", ["int64", "Int64", "int64[pyarrow]"]) def test_dtype_consistency(dtype): # test for issue #6781 - # TODO: add pyarrow dtype res_dtype = pd.DataFrame([1, 2, 3, 4], dtype=dtype).sum().dtype assert res_dtype == pandas.api.types.pandas_dtype(dtype) From 9d6d8394ae62a1ed520d69dd6d6d9d996ee138e0 Mon Sep 17 00:00:00 2001 From: Anatoly Myachev Date: Mon, 13 May 2024 16:12:59 +0200 Subject: [PATCH 39/50] cleanup Signed-off-by: Anatoly Myachev --- modin/core/dataframe/base/dataframe/utils.py | 7 ++----- modin/core/dataframe/pandas/metadata/dtypes.py | 2 +- modin/core/storage_formats/pandas/aggregations.py | 2 +- 3 files changed, 4 insertions(+), 7 deletions(-) diff --git a/modin/core/dataframe/base/dataframe/utils.py b/modin/core/dataframe/base/dataframe/utils.py index 66657ae76cb..7a1478ca5da 100644 --- a/modin/core/dataframe/base/dataframe/utils.py +++ b/modin/core/dataframe/base/dataframe/utils.py @@ -24,7 +24,7 @@ import pandas from pandas._typing import IndexLabel from pandas.api.types import is_scalar -from pandas.core.dtypes.common import is_float_dtype, is_numeric_dtype +from pandas.core.dtypes.common import is_integer_dtype class Axis(Enum): # noqa: PR01 @@ -170,10 +170,7 @@ def is_trivial_index(index: pandas.Index) -> bool: return True if isinstance(index, pandas.RangeIndex): return 
index.start == 0 and index.step == 1 - if not ( - isinstance(index, pandas.Index) - and (is_numeric_dtype(index) and not is_float_dtype(index)) - ): + if not (isinstance(index, pandas.Index) and is_integer_dtype(index)): return False return ( index.is_monotonic_increasing diff --git a/modin/core/dataframe/pandas/metadata/dtypes.py b/modin/core/dataframe/pandas/metadata/dtypes.py index c7979704db2..ceb205ef74a 100644 --- a/modin/core/dataframe/pandas/metadata/dtypes.py +++ b/modin/core/dataframe/pandas/metadata/dtypes.py @@ -496,7 +496,7 @@ def _merge_dtypes( # in the 'dtypes_matrix' series = pandas.Series(dtypes, name=i) dtypes_matrix = pandas.concat([dtypes_matrix, series], axis=1) - if val._know_all_names and val._remaining_dtype is None: + if not (val._know_all_names and val._remaining_dtype is None): dtypes_matrix.fillna( value={ # If we encountered a 'NaN' while 'val' describes all the columns, then diff --git a/modin/core/storage_formats/pandas/aggregations.py b/modin/core/storage_formats/pandas/aggregations.py index fd7d84f49d8..094b202700e 100644 --- a/modin/core/storage_formats/pandas/aggregations.py +++ b/modin/core/storage_formats/pandas/aggregations.py @@ -56,7 +56,7 @@ def corr_method( min_periods: int = 1, numeric_only: bool = True, ) -> PandasQueryCompiler: - if method != "pearson" or qc._modin_frame._pandas_backend == "pyarrow": + if method != "pearson" or qc.get_backend() == "pyarrow": return super(type(qc), qc).corr( method=method, min_periods=min_periods, numeric_only=numeric_only ) From 22f2db62f1ab9121bc80c95bfac8e048ad80c15b Mon Sep 17 00:00:00 2001 From: Anatoly Myachev Date: Mon, 13 May 2024 20:49:50 +0200 Subject: [PATCH 40/50] cleanup Signed-off-by: Anatoly Myachev --- modin/pandas/dataframe.py | 4 ++-- modin/tests/pandas/dataframe/test_binary.py | 19 ++++++++----------- modin/tests/pandas/dataframe/test_reduce.py | 6 ++++++ modin/tests/pandas/utils.py | 1 - 4 files changed, 16 insertions(+), 14 deletions(-) diff --git a/modin/pandas/dataframe.py b/modin/pandas/dataframe.py index fa67d6ad41d..d5b15f5be38 100644 --- a/modin/pandas/dataframe.py +++ b/modin/pandas/dataframe.py @@ -1623,7 +1623,7 @@ def prod( and numeric_only is False and min_count > len(axis_to_apply) # Type inference is not so simple for pyarrow - and self._query_compiler.get_backend() is not None + and self._query_compiler.get_backend() is None ): new_index = self.columns if not axis else self.index # >>> pd.DataFrame([1,2,3,4], dtype="int64[pyarrow]").prod(min_count=10) @@ -2153,7 +2153,7 @@ def sum( and numeric_only is False and min_count > len(axis_to_apply) # Type inference is not so simple for pyarrow - and self._query_compiler.get_backend() is not None + and self._query_compiler.get_backend() is None ): new_index = self.columns if not axis else self.index return Series( diff --git a/modin/tests/pandas/dataframe/test_binary.py b/modin/tests/pandas/dataframe/test_binary.py index 0a8aa80d6d3..9a72cd9d0dc 100644 --- a/modin/tests/pandas/dataframe/test_binary.py +++ b/modin/tests/pandas/dataframe/test_binary.py @@ -75,7 +75,8 @@ *("truediv", "rtruediv", "mul", "rmul", "floordiv", "rfloordiv"), ], ) -def test_math_functions(other, axis, op): +@pytest.mark.parametrize("backend", [None, "pyarrow"]) +def test_math_functions(other, axis, op, backend): data = test_data["float_nan_data"] if (op == "floordiv" or op == "rfloordiv") and axis == "rows": # lambda == "series_or_list" @@ -85,16 +86,12 @@ def test_math_functions(other, axis, op): # lambda == "series_or_list" pytest.xfail(reason="different 
behavior") - md_df, pd_df = create_test_dfs(data) - if op in ("mod", "rmod") and any("pyarrow" in str(dtype) for dtype in pd_df.dtypes): - with pytest.raises(NotImplementedError): - eval_general( - md_df, pd_df, lambda df: getattr(df, op)(other(df, axis), axis=axis) - ) - else: - eval_general( - md_df, pd_df, lambda df: getattr(df, op)(other(df, axis), axis=axis) - ) + if op in ("mod", "rmod") and backend == "pyarrow": + pytest.skip(reason="Not implemented for pyarrow backend") + eval_general( + *create_test_dfs(data, backend=backend), + lambda df: getattr(df, op)(other(df, axis), axis=axis), + ) @pytest.mark.parametrize("other", [lambda df: 2, lambda df: df]) diff --git a/modin/tests/pandas/dataframe/test_reduce.py b/modin/tests/pandas/dataframe/test_reduce.py index 74c8285ba04..d6f76d68507 100644 --- a/modin/tests/pandas/dataframe/test_reduce.py +++ b/modin/tests/pandas/dataframe/test_reduce.py @@ -355,6 +355,12 @@ def test_sum_prod_specific(fn, min_count, numeric_only): ) +@pytest.mark.parametrize("backend", [None, "pyarrow"]) +def test_sum_prod_min_count(backend): + md_df, pd_df = create_test_dfs(test_data["float_nan_data"], backend=backend) + eval_general(md_df, pd_df, lambda df: df.prod(min_count=len(pd_df) + 1)) + + @pytest.mark.parametrize("data", test_data_values, ids=test_data_keys) def test_sum_single_column(data): modin_df = pd.DataFrame(data).iloc[:, [0]] diff --git a/modin/tests/pandas/utils.py b/modin/tests/pandas/utils.py index 1949097ce4f..2dd4346c814 100644 --- a/modin/tests/pandas/utils.py +++ b/modin/tests/pandas/utils.py @@ -662,7 +662,6 @@ def assert_dtypes_equal(df1, df2): lambda obj: isinstance(obj, pandas.PeriodDtype), ) - # `test_pivot_table_margins` failed due to usage ``pd.NA`` in column name for idx in range(len(dtypes1)): for comparator in dtype_comparators: if assert_all_act_same(comparator, dtypes1.iloc[idx], dtypes2.iloc[idx]): From acc20b34d3bedeae52c15fb1c290931cd2167f9e Mon Sep 17 00:00:00 2001 From: Anatoly Myachev Date: Tue, 14 May 2024 00:16:53 +0200 Subject: [PATCH 41/50] cleanup Signed-off-by: Anatoly Myachev --- .../storage_formats/pandas/aggregations.py | 1 + .../storage_formats/pandas/query_compiler.py | 21 ++++++------------- modin/pandas/utils.py | 14 ++++++++++++- modin/tests/pandas/dataframe/test_default.py | 15 ++++++------- 4 files changed, 28 insertions(+), 23 deletions(-) diff --git a/modin/core/storage_formats/pandas/aggregations.py b/modin/core/storage_formats/pandas/aggregations.py index 094b202700e..8af6dd40bfb 100644 --- a/modin/core/storage_formats/pandas/aggregations.py +++ b/modin/core/storage_formats/pandas/aggregations.py @@ -56,6 +56,7 @@ def corr_method( min_periods: int = 1, numeric_only: bool = True, ) -> PandasQueryCompiler: + # Further implementation is designed for the default pandas backend (numpy) if method != "pearson" or qc.get_backend() == "pyarrow": return super(type(qc), qc).corr( method=method, min_periods=min_periods, numeric_only=numeric_only diff --git a/modin/core/storage_formats/pandas/query_compiler.py b/modin/core/storage_formats/pandas/query_compiler.py index 358ae635d47..8fb08a969cc 100644 --- a/modin/core/storage_formats/pandas/query_compiler.py +++ b/modin/core/storage_formats/pandas/query_compiler.py @@ -39,7 +39,6 @@ is_datetime64_any_dtype, is_list_like, is_numeric_dtype, - is_timedelta64_dtype, ) from pandas.core.groupby.base import transformation_kernels from pandas.core.indexes.api import ensure_index_from_sequences @@ -1855,7 +1854,6 @@ def isin_func(df, values): abs = 
Map.register(pandas.DataFrame.abs, dtypes="copy") map = Map.register(pandas.DataFrame.map) - # Will it work with pyarrow backend? conj = Map.register(lambda df, *args, **kwargs: pandas.DataFrame(np.conj(df))) def convert_dtypes( @@ -1876,13 +1874,14 @@ def convert_dtypes( convert_floating=convert_floating, dtype_backend=dtype_backend, ) + # TODO: `numpy_nullable` should be handled similar if dtype_backend == "pyarrow": result._modin_frame._pandas_backend = "pyarrow" return result invert = Map.register(pandas.DataFrame.__invert__, dtypes="copy") isna = Map.register(pandas.DataFrame.isna, dtypes=np.bool_) - # better way to distinguish methods for NumPy API? + # TODO: better way to distinguish methods for NumPy API? _isfinite = Map.register( lambda df, *args, **kwargs: pandas.DataFrame(np.isfinite(df, *args, **kwargs)), dtypes=np.bool_, @@ -2272,7 +2271,7 @@ def clip(self, lower, upper, **kwargs): corr = CorrCovBuilder.build_corr_method() def cov(self, min_periods=None, ddof=1): - if self._modin_frame._pandas_backend == "pyarrow": + if self.get_backend() == "pyarrow": return super().cov(min_periods=min_periods, ddof=ddof) # _nancorr use numpy which incompatible with pandas dataframes on pyarrow return self._nancorr(min_periods=min_periods, cov=True, ddof=ddof) @@ -2642,11 +2641,7 @@ def quantile_for_list_of_values(self, **kwargs): new_columns = [ col for col, dtype in zip(self.columns, self.dtypes) - if ( - is_numeric_dtype(dtype) - or is_timedelta64_dtype(dtype) - or is_datetime64_any_dtype(dtype) - ) + if (is_numeric_dtype(dtype) or lib.is_np_dtype(dtype, "mM")) ] if axis == 1: query_compiler = self.getitem_column_array(new_columns) @@ -2841,7 +2836,6 @@ def applyier(df, internal_indices, other=[], internal_other_indices=[]): # __getitem__ methods __getitem_bool = Binary.register( - # TODO: `is_scalar` don't work with pyarrow scalars lambda df, r: df[[r]] if is_scalar(r) else df[r], join_type="left", labels="drop", @@ -4532,20 +4526,17 @@ def iloc_mut(partition, row_internal_indices, col_internal_indices, item): pandas.DataFrame Partition data with updated values. """ + partition = partition.copy() try: partition.iloc[row_internal_indices, col_internal_indices] = item except ValueError: - # maybe make a copy only if there is an exception? - partition = partition.copy() # `copy` is needed to avoid "ValueError: buffer source array is read-only" for `item` # because the item may be converted to the type that is in the dataframe. # TODO: in the future we will need to convert to the correct type manually according # to the following warning. Example: "FutureWarning: Setting an item of incompatible # dtype is deprecated and will raise in a future error of pandas. Value '[1.38629436]' # has dtype incompatible with int64, please explicitly cast to a compatible dtype first." - partition.iloc[row_internal_indices, col_internal_indices] = ( - item.copy() if hasattr(item, "copy") else item - ) + partition.iloc[row_internal_indices, col_internal_indices] = item.copy() return partition if not is_scalar(item): diff --git a/modin/pandas/utils.py b/modin/pandas/utils.py index 6039baa2c7b..19e7f4df1c3 100644 --- a/modin/pandas/utils.py +++ b/modin/pandas/utils.py @@ -119,6 +119,19 @@ def is_scalar(obj): def get_pandas_backend(dtypes: pandas.Series) -> str | None: + """ + Determine the backend based on the `dtypes`. + + Parameters + ---------- + dtypes : pandas.Series + DataFrame dtypes. + + Returns + ------- + str | None + Backend name. 
+    """
     backend = None
     if any(isinstance(x, pandas.ArrowDtype) for x in dtypes):
         backend = "pyarrow"
@@ -306,7 +319,6 @@ def broadcast_item(
     try:
         # Cast to numpy drop information about heterogeneous types (cast to common)
         # TODO: we shouldn't do that, maybe there should be the if branch
-        # TODO: what if item comes from pyarrow
         item = np.array(item)
         if dtypes is None:
             dtypes = pandas.Series([item.dtype] * len(col_lookup))
diff --git a/modin/tests/pandas/dataframe/test_default.py b/modin/tests/pandas/dataframe/test_default.py
index 64fb650f10c..45ab3e2ec95 100644
--- a/modin/tests/pandas/dataframe/test_default.py
+++ b/modin/tests/pandas/dataframe/test_default.py
@@ -250,15 +250,16 @@ def test_combine_first():
 
 class TestCorr:
     @pytest.mark.parametrize("method", ["pearson", "kendall", "spearman"])
-    def test_corr(self, method):
+    @pytest.mark.parametrize("backend", [None, "pyarrow"])
+    def test_corr(self, method, backend):
         eval_general(
-            *create_test_dfs(test_data["int_data"]),
+            *create_test_dfs(test_data["int_data"], backend=backend),
             lambda df: df.corr(method=method),
         )
         # Modin result may slightly differ from pandas result
         # due to floating-point arithmetic.
         eval_general(
-            *create_test_dfs(test_data["float_nan_data"]),
+            *create_test_dfs(test_data["float_nan_data"], backend=backend),
             lambda df: df.corr(method=method),
             comparator=modin_df_almost_equals_pandas,
         )
@@ -352,7 +353,8 @@ def test_corr_nans_in_different_partitions(self):
 
 @pytest.mark.parametrize("min_periods", [1, 3, 5], ids=lambda x: f"min_periods={x}")
 @pytest.mark.parametrize("ddof", [1, 2, 4], ids=lambda x: f"ddof={x}")
-def test_cov(min_periods, ddof):
+@pytest.mark.parametrize("backend", [None, "pyarrow"])
+def test_cov(min_periods, ddof, backend):
     # Modin result may slightly differ from pandas result
     # due to floating-point arithmetic.
if StorageFormat.get() == "Hdk": @@ -366,13 +368,13 @@ def comparator1(df1, df2): comparator2 = modin_df_almost_equals_pandas eval_general( - *create_test_dfs(test_data["int_data"]), + *create_test_dfs(test_data["int_data"], backend=backend), lambda df: df.cov(min_periods=min_periods, ddof=ddof), comparator=comparator1, ) eval_general( - *create_test_dfs(test_data["float_nan_data"]), + *create_test_dfs(test_data["float_nan_data"], backend=backend), lambda df: df.cov(min_periods=min_periods), comparator=comparator2, ) @@ -634,7 +636,6 @@ def test_pivot(data, index, columns, values, request): expected_exception = ValueError( "Index contains duplicate entries, cannot reshape" ) - # failed because pandas doesn't preserve dtype backend eval_general( *create_test_dfs(data), lambda df, *args, **kwargs: df.pivot(*args, **kwargs), From 7a91fc451a141a4e562318de30f4413fb22d8f09 Mon Sep 17 00:00:00 2001 From: Anatoly Myachev Date: Tue, 14 May 2024 01:37:13 +0200 Subject: [PATCH 42/50] cleanup Signed-off-by: Anatoly Myachev --- .../dataframe/pandas/dataframe/dataframe.py | 13 ++++++------ .../pandas/partitioning/partition_manager.py | 7 +++---- .../storage_formats/base/query_compiler.py | 20 ++++++++++++++----- .../storage_formats/pandas/query_compiler.py | 4 ---- modin/pandas/base.py | 2 +- modin/utils.py | 3 +++ 6 files changed, 28 insertions(+), 21 deletions(-) diff --git a/modin/core/dataframe/pandas/dataframe/dataframe.py b/modin/core/dataframe/pandas/dataframe/dataframe.py index 06accd0884e..273fe8bf22d 100644 --- a/modin/core/dataframe/pandas/dataframe/dataframe.py +++ b/modin/core/dataframe/pandas/dataframe/dataframe.py @@ -1200,7 +1200,7 @@ def _take_2d_positional( + f"received: {type(indexer)}", ) if isinstance(indexer, list): - indexer = np.array(indexer, dtype="int64") + indexer = np.array(indexer, dtype=np.int64) indexers.append(indexer) row_positions, col_positions = indexers @@ -1760,7 +1760,6 @@ def astype_builder(df): new_frame = self._partition_mgr_cls.lazy_map_partitions( self._partitions, astype_builder ) - return self.__constructor__( new_frame, self.copy_index_cache(copy_lengths=True), @@ -1881,13 +1880,13 @@ def _get_dict_of_block_index(self, axis, indices, are_indices_sorted=False): return dict_of_slices if isinstance(indices, list): # Converting python list to numpy for faster processing - indices = np.array(indices, dtype="int64") + indices = np.array(indices, dtype=np.int64) # Fasttrack empty numpy array if isinstance(indices, np.ndarray) and indices.size == 0: # This will help preserve metadata stored in empty dataframes (indexes and dtypes) # Otherwise, we will get an empty `new_partitions` array, from which it will # no longer be possible to obtain metadata - return dict([(0, np.array([], dtype="int64"))]) + return dict([(0, np.array([], dtype=np.int64))]) negative_mask = np.less(indices, 0) has_negative = np.any(negative_mask) if has_negative: @@ -1895,7 +1894,7 @@ def _get_dict_of_block_index(self, axis, indices, are_indices_sorted=False): indices = ( indices.copy() if isinstance(indices, np.ndarray) - else np.array(indices, dtype="int64") + else np.array(indices, dtype=np.int64) ) indices[negative_mask] = indices[negative_mask] % len(self.get_axis(axis)) # If the `indices` array was modified because of the negative indices conversion @@ -4585,7 +4584,7 @@ def from_pandas(cls, df): new_index = df.index new_columns = df.columns new_dtypes = df.dtypes - new_frame, new_lengths, new_widths, pandas_backend = ( + new_frame, pandas_backend, new_lengths, new_widths = ( 
cls._partition_mgr_cls.from_pandas(df, True) ) return cls( @@ -4613,7 +4612,7 @@ def from_arrow(cls, at): PandasDataframe New Modin DataFrame. """ - new_frame, new_lengths, new_widths, pandas_backend = ( + new_frame, pandas_backend, new_lengths, new_widths = ( cls._partition_mgr_cls.from_arrow(at, return_dims=True) ) new_columns = Index.__new__(Index, data=at.column_names, dtype="O") diff --git a/modin/core/dataframe/pandas/partitioning/partition_manager.py b/modin/core/dataframe/pandas/partitioning/partition_manager.py index 0f2e99cfb22..c4a8afa47b2 100644 --- a/modin/core/dataframe/pandas/partitioning/partition_manager.py +++ b/modin/core/dataframe/pandas/partitioning/partition_manager.py @@ -946,7 +946,7 @@ def from_pandas(cls, df, return_dims=False): Returns ------- - np.ndarray or (np.ndarray, row_lengths, col_widths) + (np.ndarray, backend) or (np.ndarray, backend, row_lengths, col_widths) A NumPy array with partitions (with dimensions or not). """ num_splits = NPartitions.get() @@ -1008,7 +1008,7 @@ def update_bar(f): ) for i in range(0, len(df.columns), col_chunksize) ] - return parts, row_lengths, col_widths, backend + return parts, backend, row_lengths, col_widths @classmethod def from_arrow(cls, at, return_dims=False): @@ -1025,10 +1025,9 @@ def from_arrow(cls, at, return_dims=False): Returns ------- - np.ndarray or (np.ndarray, row_lengths, col_widths) + (np.ndarray, backend) or (np.ndarray, backend, row_lengths, col_widths) A NumPy array with partitions (with dimensions or not). """ - # also return backend return cls.from_pandas(at.to_pandas(), return_dims=return_dims) @classmethod diff --git a/modin/core/storage_formats/base/query_compiler.py b/modin/core/storage_formats/base/query_compiler.py index b4bd72ce0e0..662ce2c9dd1 100644 --- a/modin/core/storage_formats/base/query_compiler.py +++ b/modin/core/storage_formats/base/query_compiler.py @@ -1116,7 +1116,7 @@ def merge_asof( tolerance=None, allow_exact_matches: bool = True, direction: str = "backward", - ): + ): # noqa: GL08 # Pandas fallbacks for tricky cases: if ( # No idea how this works or why it does what it does; and in fact @@ -3620,7 +3620,9 @@ def groupby_fillna( drop=drop, ) - def groupby_diff(self, by, axis, groupby_kwargs, agg_args, agg_kwargs, drop=False): + def groupby_diff( + self, by, axis, groupby_kwargs, agg_args, agg_kwargs, drop=False + ): # noqa: GL08 return self.groupby_agg( by=by, agg_func="diff", @@ -3633,7 +3635,7 @@ def groupby_diff(self, by, axis, groupby_kwargs, agg_args, agg_kwargs, drop=Fals def groupby_pct_change( self, by, axis, groupby_kwargs, agg_args, agg_kwargs, drop=False - ): + ): # noqa: GL08 return self.groupby_agg( by=by, agg_func="pct_change", @@ -3941,7 +3943,7 @@ def groupby_ohlc( agg_args, agg_kwargs, is_df, - ): + ): # noqa: GL08 if not is_df: return self.groupby_agg( by=by, @@ -4605,7 +4607,7 @@ def shift( freq, axis, fill_value, - ): + ): # noqa: GL08 return DataFrameDefault.register(pandas.DataFrame.shift)( self, periods, freq, axis, fill_value ) @@ -6756,6 +6758,14 @@ def case_when(self, caselist): # noqa: PR01, RT01, D200 return SeriesDefault.register(pandas.Series.case_when)(self, caselist=caselist) def get_backend(self) -> Optional[str]: + """ + Get backend stored in `_modin_frame`. + + Returns + ------- + str | None + Backend name. 
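+
+        Notes
+        -----
+        A minimal sketch of the contract (assuming a frame built from
+        pyarrow-backed dtypes): the value is ``"pyarrow"`` when the frame
+        holds ``pandas.ArrowDtype`` columns and ``None`` for the default
+        NumPy backend.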
+ """ return self._modin_frame._pandas_backend def repartition(self, axis=None): diff --git a/modin/core/storage_formats/pandas/query_compiler.py b/modin/core/storage_formats/pandas/query_compiler.py index 8fb08a969cc..f8a0964079d 100644 --- a/modin/core/storage_formats/pandas/query_compiler.py +++ b/modin/core/storage_formats/pandas/query_compiler.py @@ -4602,10 +4602,6 @@ def sort_columns_by_row_values(self, rows, ascending=True, **kwargs): def cat_codes(self): def func(df: pandas.DataFrame) -> pandas.DataFrame: ser = df.iloc[:, 0] - if not isinstance(ser.dtype, pandas.CategoricalDtype): - raise TypeError( - f"Series dtype should be `CategoricalDtype`: actual dtype: {ser.dtype}" - ) return ser.cat.codes.to_frame(name=MODIN_UNNAMED_SERIES_LABEL) res = self._modin_frame.map(func=func, new_columns=[MODIN_UNNAMED_SERIES_LABEL]) diff --git a/modin/pandas/base.py b/modin/pandas/base.py index 16cdc1bdec2..746ed2ec9fa 100644 --- a/modin/pandas/base.py +++ b/modin/pandas/base.py @@ -364,7 +364,7 @@ def _validate_other( other_dtypes = [other.dtype] * len(other) elif is_dict_like(other): other_dtypes = [ - type(other[label]) + other[label] if pandas.isna(other[label]) else type(other[label]) for label in self._get_axis(axis) # The binary operation is applied for intersection of axis labels # and dictionary keys. So filtering out extra keys. diff --git a/modin/utils.py b/modin/utils.py index a3ed1dc91a3..ee72620fa7b 100644 --- a/modin/utils.py +++ b/modin/utils.py @@ -310,6 +310,9 @@ def _replace_doc( target_doc = target_obj.__doc__ or "" overwrite = overwrite or not target_doc doc = source_doc if overwrite else target_doc + if doc == "": + # Empty docstrings do not need to be inherited + return if parent_cls and not attr_name: if isinstance(target_obj, property): From d31e93f497844c1b38a8478088a29f4644d4307a Mon Sep 17 00:00:00 2001 From: Anatoly Myachev Date: Tue, 14 May 2024 01:38:54 +0200 Subject: [PATCH 43/50] revert changes in metadata/dtypes.py Signed-off-by: Anatoly Myachev --- modin/core/dataframe/pandas/metadata/dtypes.py | 6 +----- 1 file changed, 1 insertion(+), 5 deletions(-) diff --git a/modin/core/dataframe/pandas/metadata/dtypes.py b/modin/core/dataframe/pandas/metadata/dtypes.py index ceb205ef74a..96f06aa757f 100644 --- a/modin/core/dataframe/pandas/metadata/dtypes.py +++ b/modin/core/dataframe/pandas/metadata/dtypes.py @@ -528,11 +528,7 @@ def _merge_dtypes( def combine_dtypes(row): if (row == "unknown").any(): return "unknown" - if any("pyarrow" in str(x) for x in row): - # nans can be stored not only in float types, for example in `bool[pyarrow]` - row = row[~row.isna()] - else: - row = row.fillna(pandas.api.types.pandas_dtype("float")) + row = row.fillna(pandas.api.types.pandas_dtype("float")) return find_common_type(list(row.values)) dtypes = dtypes_matrix.apply(combine_dtypes, axis=1) From b3179fcd9d0313e85e51c8a92853f788b959897b Mon Sep 17 00:00:00 2001 From: Anatoly Myachev Date: Tue, 14 May 2024 01:45:18 +0200 Subject: [PATCH 44/50] fix tests Signed-off-by: Anatoly Myachev --- modin/core/dataframe/pandas/dataframe/dataframe.py | 2 ++ modin/tests/pandas/test_series.py | 14 ++++++++------ 2 files changed, 10 insertions(+), 6 deletions(-) diff --git a/modin/core/dataframe/pandas/dataframe/dataframe.py b/modin/core/dataframe/pandas/dataframe/dataframe.py index 23bd9f4428a..bbf3af12ef9 100644 --- a/modin/core/dataframe/pandas/dataframe/dataframe.py +++ b/modin/core/dataframe/pandas/dataframe/dataframe.py @@ -101,6 +101,8 @@ class PandasDataframe( each of the block partitions. 
Is computed if not provided.
     dtypes : pandas.Series or callable, optional
         The data types for the dataframe columns.
+    pandas_backend : {"pyarrow", None}, optional
+        Backend used by pandas.
     """
 
     _partition_mgr_cls: PandasDataframePartitionManager
diff --git a/modin/tests/pandas/test_series.py b/modin/tests/pandas/test_series.py
index dd8dbe85da9..4c45102ede5 100644
--- a/modin/tests/pandas/test_series.py
+++ b/modin/tests/pandas/test_series.py
@@ -1419,12 +1419,14 @@ def comparator(df1, df2):
         comparator=comparator,
     )
 
-    eval_general(
-        modin_series,
-        pandas_series,
-        lambda ser: ser > (ser + 1),
-        comparator=comparator,
-    )
+    if StorageFormat.get() != "Hdk":
+        # FIXME: HDK should also work in this case
+        eval_general(
+            modin_series,
+            pandas_series,
+            lambda ser: ser > (ser + 1),
+            comparator=comparator,
+        )
 
     eval_general(
         modin_series,

From b3471ff7e921454512381f9c9eb267a2777c04dc Mon Sep 17 00:00:00 2001
From: Anatoly Myachev
Date: Tue, 14 May 2024 13:05:05 +0200
Subject: [PATCH 45/50] cleanup

Signed-off-by: Anatoly Myachev
---
 .../pandas_on_dask/dataframe/dataframe.py           |  2 ++
 .../pandas_on_python/dataframe/dataframe.py         |  2 ++
 .../pandas_on_ray/dataframe/dataframe.py            |  2 ++
 .../pandas_on_unidist/dataframe/dataframe.py        |  2 ++
 modin/core/storage_formats/base/query_compiler.py   | 12 +++++-------
 modin/utils.py                                      |  3 ---
 6 files changed, 13 insertions(+), 10 deletions(-)

diff --git a/modin/core/execution/dask/implementations/pandas_on_dask/dataframe/dataframe.py b/modin/core/execution/dask/implementations/pandas_on_dask/dataframe/dataframe.py
index 0920d963840..5e4598d0ddf 100644
--- a/modin/core/execution/dask/implementations/pandas_on_dask/dataframe/dataframe.py
+++ b/modin/core/execution/dask/implementations/pandas_on_dask/dataframe/dataframe.py
@@ -38,6 +38,8 @@ class PandasOnDaskDataframe(PandasDataframe):
         each of the block partitions. Is computed if not provided.
     dtypes : pandas.Series, optional
         The data types for the dataframe columns.
+    pandas_backend : {"pyarrow", None}, optional
+        Backend used by pandas. ``None`` means the default NumPy backend.
     """
 
     _partition_mgr_cls = PandasOnDaskDataframePartitionManager
diff --git a/modin/core/execution/python/implementations/pandas_on_python/dataframe/dataframe.py b/modin/core/execution/python/implementations/pandas_on_python/dataframe/dataframe.py
index 6e314beaa9c..0e2bc70d995 100644
--- a/modin/core/execution/python/implementations/pandas_on_python/dataframe/dataframe.py
+++ b/modin/core/execution/python/implementations/pandas_on_python/dataframe/dataframe.py
@@ -45,6 +45,8 @@ class PandasOnPythonDataframe(PandasDataframe):
         each of the block partitions. Is computed if not provided.
     dtypes : pandas.Series, optional
         The data types for the dataframe columns.
+    pandas_backend : {"pyarrow", None}, optional
+        Backend used by pandas. ``None`` means the default NumPy backend.
     """
 
     _partition_mgr_cls = PandasOnPythonDataframePartitionManager
diff --git a/modin/core/execution/ray/implementations/pandas_on_ray/dataframe/dataframe.py b/modin/core/execution/ray/implementations/pandas_on_ray/dataframe/dataframe.py
index 6838fd9edca..373a84ecdb4 100644
--- a/modin/core/execution/ray/implementations/pandas_on_ray/dataframe/dataframe.py
+++ b/modin/core/execution/ray/implementations/pandas_on_ray/dataframe/dataframe.py
@@ -39,6 +39,8 @@ class PandasOnRayDataframe(PandasDataframe):
         each of the block partitions. Is computed if not provided.
     dtypes : pandas.Series, optional
         The data types for the dataframe columns.
+    pandas_backend : {"pyarrow", None}, optional
+        Backend used by pandas. ``None`` means the default NumPy backend.
     """
 
     _partition_mgr_cls = PandasOnRayDataframePartitionManager
diff --git a/modin/core/execution/unidist/implementations/pandas_on_unidist/dataframe/dataframe.py b/modin/core/execution/unidist/implementations/pandas_on_unidist/dataframe/dataframe.py
index 3241e9299e8..9adba6bc6dc 100644
--- a/modin/core/execution/unidist/implementations/pandas_on_unidist/dataframe/dataframe.py
+++ b/modin/core/execution/unidist/implementations/pandas_on_unidist/dataframe/dataframe.py
@@ -38,6 +38,8 @@ class PandasOnUnidistDataframe(PandasDataframe):
         each of the block partitions. Is computed if not provided.
     dtypes : pandas.Series, optional
         The data types for the dataframe columns.
+    pandas_backend : {"pyarrow", None}, optional
+        Backend used by pandas. ``None`` means the default NumPy backend.
     """
 
     _partition_mgr_cls = PandasOnUnidistDataframePartitionManager
diff --git a/modin/core/storage_formats/base/query_compiler.py b/modin/core/storage_formats/base/query_compiler.py
index df21d021b82..25c38929014 100644
--- a/modin/core/storage_formats/base/query_compiler.py
+++ b/modin/core/storage_formats/base/query_compiler.py
@@ -1118,7 +1118,7 @@ def merge_asof(
         tolerance=None,
         allow_exact_matches: bool = True,
         direction: str = "backward",
-    ):  # noqa: GL08
+    ):
         # Pandas fallbacks for tricky cases:
         if (
             # No idea how this works or why it does what it does; and in fact
@@ -3622,9 +3622,7 @@ def groupby_fillna(
             drop=drop,
         )
 
-    def groupby_diff(
-        self, by, axis, groupby_kwargs, agg_args, agg_kwargs, drop=False
-    ):  # noqa: GL08
+    def groupby_diff(self, by, axis, groupby_kwargs, agg_args, agg_kwargs, drop=False):
         return self.groupby_agg(
             by=by,
             agg_func="diff",
@@ -3637,7 +3635,7 @@ def groupby_pct_change(
         self, by, axis, groupby_kwargs, agg_args, agg_kwargs, drop=False
-    ):  # noqa: GL08
+    ):
         return self.groupby_agg(
             by=by,
             agg_func="pct_change",
@@ -3945,7 +3943,7 @@ def groupby_ohlc(
         agg_args,
         agg_kwargs,
         is_df,
-    ):  # noqa: GL08
+    ):
         if not is_df:
             return self.groupby_agg(
                 by=by,
@@ -4609,7 +4607,7 @@ def shift(
         freq,
         axis,
         fill_value,
-    ):  # noqa: GL08
+    ):
         return DataFrameDefault.register(pandas.DataFrame.shift)(
             self, periods, freq, axis, fill_value
         )
diff --git a/modin/utils.py b/modin/utils.py
index ee72620fa7b..a3ed1dc91a3 100644
--- a/modin/utils.py
+++ b/modin/utils.py
@@ -310,9 +310,6 @@ def _replace_doc(
     target_doc = target_obj.__doc__ or ""
     overwrite = overwrite or not target_doc
     doc = source_doc if overwrite else target_doc
-    if doc == "":
-        # Empty docstrings do not need to be inherited
-        return
 
     if parent_cls and not attr_name:
         if isinstance(target_obj, property):

From 14b4dd3274d30e8bfb4fb4894c512f381f3db6db Mon Sep 17 00:00:00 2001
From: Anatoly Myachev
Date: Tue, 14 May 2024 13:22:28 +0200
Subject: [PATCH 46/50] fix

Signed-off-by: Anatoly Myachev
---
 .../ray/implementations/cudf_on_ray/dataframe/dataframe.py | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/modin/core/execution/ray/implementations/cudf_on_ray/dataframe/dataframe.py b/modin/core/execution/ray/implementations/cudf_on_ray/dataframe/dataframe.py
index a40d10cd4c1..b4fef2ed18a 100644
--- a/modin/core/execution/ray/implementations/cudf_on_ray/dataframe/dataframe.py
+++ b/modin/core/execution/ray/implementations/cudf_on_ray/dataframe/dataframe.py
@@ -50,6 +50,8 @@ class cuDFOnRayDataframe(PandasOnRayDataframe):
         each of the block partitions. Is computed if not provided.
     dtypes : pandas.Series, optional
         The data types for the dataframe columns.
+    pandas_backend : {"pyarrow", None}, optional
+        Backend used by pandas. ``None`` means the default NumPy backend.
     """
 
     _partition_mgr_cls = cuDFOnRayDataframePartitionManager

From 7abfc427bf359c98c87ea94883af2f5a8f13f42b Mon Sep 17 00:00:00 2001
From: Anatoly Myachev
Date: Tue, 14 May 2024 15:54:20 +0200
Subject: [PATCH 47/50] Apply suggestions from code review

Co-authored-by: Iaroslav Igoshev
---
 modin/core/dataframe/pandas/dataframe/dataframe.py | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/modin/core/dataframe/pandas/dataframe/dataframe.py b/modin/core/dataframe/pandas/dataframe/dataframe.py
index 0d3cffdbd59..cd0e75ed861 100644
--- a/modin/core/dataframe/pandas/dataframe/dataframe.py
+++ b/modin/core/dataframe/pandas/dataframe/dataframe.py
@@ -143,13 +143,13 @@ def __init__(
         self._row_lengths_cache = row_lengths
         self._column_widths_cache = column_widths
         self._pandas_backend = pandas_backend
-        if not pandas_backend == "pyarrow":
+        if pandas_backend != "pyarrow":
+            self.set_dtypes_cache(dtypes)
+        else:
             # In this case, the type precomputation may be incorrect; we need
             # to know the type algebra precisely. Considering the number of operations
             # and different combinations of backends, the best solution would be to
             # introduce optimizations gradually, with a large number of tests.
-            self.set_dtypes_cache(dtypes)
-        else:
             self.set_dtypes_cache(None)
 
         self._validate_axes_lengths()

From 30d47494f6d5108a32e1ea77523fd7138f0274fb Mon Sep 17 00:00:00 2001
From: Anatoly Myachev
Date: Tue, 14 May 2024 15:56:45 +0200
Subject: [PATCH 48/50] Apply suggestions from code review

Co-authored-by: Iaroslav Igoshev
---
 modin/tests/pandas/test_series.py | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/modin/tests/pandas/test_series.py b/modin/tests/pandas/test_series.py
index 4c45102ede5..be737e4c70d 100644
--- a/modin/tests/pandas/test_series.py
+++ b/modin/tests/pandas/test_series.py
@@ -1420,7 +1420,8 @@ def comparator(df1, df2):
     )
 
     if StorageFormat.get() != "Hdk":
-        # FIXME: HDK should also work in this case
+        # FIXME: HDK should also work in this case, but since it
+        # is deprecated, we will simply remove this branch later
         eval_general(
             modin_series,
             pandas_series,

From 3213194b2733f7efe74225a5a0bf671b7aa4ef15 Mon Sep 17 00:00:00 2001
From: Anatoly Myachev
Date: Tue, 14 May 2024 16:13:34 +0200
Subject: [PATCH 49/50] address review comments

Signed-off-by: Anatoly Myachev
---
 modin/core/storage_formats/base/query_compiler.py   | 2 +-
 modin/core/storage_formats/pandas/aggregations.py   | 2 +-
 modin/core/storage_formats/pandas/query_compiler.py | 2 +-
 modin/pandas/dataframe.py                           | 8 ++++----
 4 files changed, 7 insertions(+), 7 deletions(-)

diff --git a/modin/core/storage_formats/base/query_compiler.py b/modin/core/storage_formats/base/query_compiler.py
index 25c38929014..50008f261a4 100644
--- a/modin/core/storage_formats/base/query_compiler.py
+++ b/modin/core/storage_formats/base/query_compiler.py
@@ -6757,7 +6757,7 @@ def case_when(self, caselist):  # noqa: PR01, RT01, D200
         ]
         return SeriesDefault.register(pandas.Series.case_when)(self, caselist=caselist)
 
-    def get_backend(self) -> Optional[str]:
+    def get_pandas_backend(self) -> Optional[str]:
         """
         Get backend stored in `_modin_frame`.
 
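A minimal usage sketch of the renamed accessor (assuming pyarrow is
installed; `_query_compiler` is internal API, and the expected values
follow from the backend detection added in the earlier patches):

    import modin.pandas as pd

    ser = pd.Series([1, 2, None], dtype="uint8[pyarrow]")
    # pyarrow-backed dtypes are detected when the frame is created
    assert ser._query_compiler.get_pandas_backend() == "pyarrow"

    ser2 = pd.Series([1, 2, 3])  # default NumPy-backed dtypes
    assert ser2._query_compiler.get_pandas_backend() is None
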
diff --git a/modin/core/storage_formats/pandas/aggregations.py b/modin/core/storage_formats/pandas/aggregations.py
index 8af6dd40bfb..3959b86f3ab 100644
--- a/modin/core/storage_formats/pandas/aggregations.py
+++ b/modin/core/storage_formats/pandas/aggregations.py
@@ -57,7 +57,7 @@ def corr_method(
         numeric_only: bool = True,
     ) -> PandasQueryCompiler:
         # The implementation below is designed for the default pandas backend (NumPy)
-        if method != "pearson" or qc.get_backend() == "pyarrow":
+        if method != "pearson" or qc.get_pandas_backend() == "pyarrow":
             return super(type(qc), qc).corr(
                 method=method, min_periods=min_periods, numeric_only=numeric_only
             )
diff --git a/modin/core/storage_formats/pandas/query_compiler.py b/modin/core/storage_formats/pandas/query_compiler.py
index 4f55159aa60..0014f4992ef 100644
--- a/modin/core/storage_formats/pandas/query_compiler.py
+++ b/modin/core/storage_formats/pandas/query_compiler.py
@@ -2284,7 +2284,7 @@ def clip(self, lower, upper, **kwargs):
 
     corr = CorrCovBuilder.build_corr_method()
 
     def cov(self, min_periods=None, ddof=1):
-        if self.get_backend() == "pyarrow":
+        if self.get_pandas_backend() == "pyarrow":
             return super().cov(min_periods=min_periods, ddof=ddof)
         # `_nancorr` uses NumPy, which is incompatible with pyarrow-backed pandas dataframes
         return self._nancorr(min_periods=min_periods, cov=True, ddof=ddof)
diff --git a/modin/pandas/dataframe.py b/modin/pandas/dataframe.py
index d5b15f5be38..7cbfc2634d9 100644
--- a/modin/pandas/dataframe.py
+++ b/modin/pandas/dataframe.py
@@ -1622,8 +1622,8 @@ def prod(
             skipna is not False
             and numeric_only is False
             and min_count > len(axis_to_apply)
-            # Type inference is not so simple for pyarrow
-            and self._query_compiler.get_backend() is None
+            # This fast path is only suitable for the default backend
+            and self._query_compiler.get_pandas_backend() is None
         ):
             new_index = self.columns if not axis else self.index
             # >>> pd.DataFrame([1,2,3,4], dtype="int64[pyarrow]").prod(min_count=10)
@@ -2152,8 +2152,8 @@ def sum(
             skipna is not False
             and numeric_only is False
             and min_count > len(axis_to_apply)
-            # Type inference is not so simple for pyarrow
-            and self._query_compiler.get_backend() is None
+            # This fast path is only suitable for the default backend
+            and self._query_compiler.get_pandas_backend() is None
         ):
             new_index = self.columns if not axis else self.index
             return Series(

From 45acef9b2b6f1c3f2f9611db16549350ee30ffaf Mon Sep 17 00:00:00 2001
From: Anatoly Myachev
Date: Tue, 14 May 2024 19:56:08 +0200
Subject: [PATCH 50/50] expand comments

Signed-off-by: Anatoly Myachev
---
 modin/core/dataframe/pandas/dataframe/dataframe.py | 4 ++++
 modin/core/storage_formats/pandas/groupby.py       | 1 +
 modin/tests/pandas/dataframe/test_binary.py        | 2 +-
 3 files changed, 6 insertions(+), 1 deletion(-)

diff --git a/modin/core/dataframe/pandas/dataframe/dataframe.py b/modin/core/dataframe/pandas/dataframe/dataframe.py
@@ -3985,6 +3985,7 @@ def _compute_new_widths():
         new_columns = joined_index
         frames = [self] + others
         # TODO: should we wrap all `concat` calls into a "try except" block?
+        # `ModinDtypes.concat` can throw an exception in the case of duplicate values
         new_dtypes = ModinDtypes.concat([frame._dtypes for frame in frames], axis=1)
         # If we have already cached the length of each row in at least one
         # of the row's partitions, we can build new_lengths for the new
@@ -4621,6 +4622,9 @@ def _arrow_type_to_dtype(cls, arrow_type):
 
         try:
             # TODO: should we map arrow types to pyarrow-backed pandas types?
+            # It seems like this might help avoid the expense of transferring
+            # data between backends (numpy and pyarrow), but we need to be sure
+            # how this fits into the type inference system in pandas.
             res = arrow_type.to_pandas_dtype()
             # Conversion to pandas is not implemented for some arrow types,
             # perform manual conversion for them:
diff --git a/modin/core/storage_formats/pandas/groupby.py b/modin/core/storage_formats/pandas/groupby.py
index 4b22b5c0158..55de645a898 100644
--- a/modin/core/storage_formats/pandas/groupby.py
+++ b/modin/core/storage_formats/pandas/groupby.py
@@ -360,6 +360,7 @@ def applyier(df, other):  # pragma: no cover
         # different partitions
         if len(index) == 0 and len(columns) > 0:
             common_type = find_common_type(result.dtypes.tolist())
+            # TODO: remove find_common_type+astype after pandas fixes the following issue:
             # transpose loses dtypes: https://github.com/pandas-dev/pandas/issues/43337
             result = result.transpose().astype(common_type, copy=False)
 
diff --git a/modin/tests/pandas/dataframe/test_binary.py b/modin/tests/pandas/dataframe/test_binary.py
index 9a72cd9d0dc..10dabbe32bc 100644
--- a/modin/tests/pandas/dataframe/test_binary.py
+++ b/modin/tests/pandas/dataframe/test_binary.py
@@ -87,7 +87,7 @@ def test_math_functions(other, axis, op, backend):
         pytest.xfail(reason="different behavior")
 
     if op in ("mod", "rmod") and backend == "pyarrow":
-        pytest.skip(reason="Not implemented for pyarrow backend")
+        pytest.skip(reason="These functions are not implemented in pandas itself")
     eval_general(
         *create_test_dfs(data, backend=backend),
         lambda df: getattr(df, op)(other(df, axis), axis=axis),