From 8e46e4e47a11d37cb7b2b49079307b5a49c57488 Mon Sep 17 00:00:00 2001 From: Anatoly Myachev Date: Mon, 11 Mar 2024 16:31:25 +0100 Subject: [PATCH 01/50] TEST-#7049: Add some sanity tests with pyarrow-backed pandas dataframes Signed-off-by: Anatoly Myachev --- modin/core/dataframe/algebra/binary.py | 1 + modin/tests/pandas/test_series.py | 88 ++++++++++++++++++++++++++ 2 files changed, 89 insertions(+) diff --git a/modin/core/dataframe/algebra/binary.py b/modin/core/dataframe/algebra/binary.py index b2ffc2e6788..75474bb3f08 100644 --- a/modin/core/dataframe/algebra/binary.py +++ b/modin/core/dataframe/algebra/binary.py @@ -250,6 +250,7 @@ def try_compute_new_dtypes( try: if infer_dtypes == "bool" or is_bool_dtype(result_dtype): + # can be `pandas.api.types.pandas_dtype("bool[pyarrow]")` depending on the data dtypes = maybe_build_dtypes_series( first, second, dtype=pandas.api.types.pandas_dtype(bool) ) diff --git a/modin/tests/pandas/test_series.py b/modin/tests/pandas/test_series.py index b079ce586dd..3530a2268d9 100644 --- a/modin/tests/pandas/test_series.py +++ b/modin/tests/pandas/test_series.py @@ -1387,6 +1387,94 @@ def test_constructor_arrow_extension_array(): df_equals(md_ser.dtypes, pd_ser.dtypes) +def test_pyarrow_constructor(): + pa = pytest.importorskip("pyarrow") + data = list("abcd") + _ = pd.Series(data, dtype="string[pyarrow]") + _ = pd.Series(data, dtype=pd.ArrowDtype(pa.string())) + + list_str_type = pa.list_(pa.string()) + _ = pd.Series([["hello"], ["there"]], dtype=pd.ArrowDtype(list_str_type)) + + from datetime import time + + _ = pd.Index([time(12, 30), None], dtype=pd.ArrowDtype(pa.time64("us"))) + + from decimal import Decimal + + decimal_type = pd.ArrowDtype(pa.decimal128(3, scale=2)) + + data = [[Decimal("3.19"), None], [None, Decimal("-1.23")]] + + _ = pd.DataFrame(data, dtype=decimal_type) + + +def test_pyarrow_array_retrieve(): + pa = pytest.importorskip("pyarrow") + modin_series, pandas_series = create_test_series( + [1, 2, None], dtype="uint8[pyarrow]" + ) + eval_general( + modin_series, + pandas_series, + lambda ser: pa.array(ser), + raising_exceptions=(Exception,), + ) + + +def test_pyarrow_functions(): + pytest.importorskip("pyarrow") + modin_series, pandas_series = create_test_series( + [-1.545, 0.211, None], dtype="float32[pyarrow]" + ) + df_equals(modin_series.mean(), pandas_series.mean()) + + def comparator(df1, df2): + df_equals(df1, df2) + df_equals(df1.dtypes, df2.dtypes) + + eval_general( + modin_series, + pandas_series, + lambda ser: ser + + (modin_series if isinstance(ser, pd.Series) else pandas_series), + comparator=comparator, + raising_exceptions=(Exception,), + ) + + eval_general( + modin_series, + pandas_series, + lambda ser: ser > (ser + 1), + comparator=comparator, + raising_exceptions=(Exception,), + ) + + eval_general( + modin_series, + pandas_series, + lambda ser: ser.dropna(), + comparator=comparator, + raising_exceptions=(Exception,), + ) + + eval_general( + modin_series, + pandas_series, + lambda ser: ser.isna(), + comparator=comparator, + raising_exceptions=(Exception,), + ) + + eval_general( + modin_series, + pandas_series, + lambda ser: ser.fillna(0), + comparator=comparator, + raising_exceptions=(Exception,), + ) + + @pytest.mark.parametrize("data", test_data_values, ids=test_data_keys) def test_copy(data): modin_series, pandas_series = create_test_series(data) From 6814c6eddee2b1004d8b8f24495e8c410cb16c7c Mon Sep 17 00:00:00 2001 From: Anatoly Myachev Date: Tue, 2 Apr 2024 15:59:51 +0200 Subject: [PATCH 02/50] fixes 
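
These are small follow-ups to the sanity tests added in the previous commit:
the blanket `raising_exceptions=(Exception,)` argument is dropped from the
new `eval_general` calls so the helper falls back to its default error
handling. For context, a minimal sketch of the kind of pyarrow-backed data
these tests exercise (illustrative only; it assumes `pyarrow` is installed
and is not part of the diff below):

    import pandas

    # values live in a pyarrow-backed extension array; None is stored
    # as a null and surfaces as pd.NA rather than np.nan
    ser = pandas.Series([1, 2, None], dtype="uint8[pyarrow]")
    assert str(ser.dtype) == "uint8[pyarrow]"
    assert ser.isna().tolist() == [False, False, True]
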
Signed-off-by: Anatoly Myachev --- modin/tests/pandas/test_series.py | 6 ------ 1 file changed, 6 deletions(-) diff --git a/modin/tests/pandas/test_series.py b/modin/tests/pandas/test_series.py index 3530a2268d9..36844772ada 100644 --- a/modin/tests/pandas/test_series.py +++ b/modin/tests/pandas/test_series.py @@ -1418,7 +1418,6 @@ def test_pyarrow_array_retrieve(): modin_series, pandas_series, lambda ser: pa.array(ser), - raising_exceptions=(Exception,), ) @@ -1439,7 +1438,6 @@ def comparator(df1, df2): lambda ser: ser + (modin_series if isinstance(ser, pd.Series) else pandas_series), comparator=comparator, - raising_exceptions=(Exception,), ) eval_general( @@ -1447,7 +1445,6 @@ def comparator(df1, df2): pandas_series, lambda ser: ser > (ser + 1), comparator=comparator, - raising_exceptions=(Exception,), ) eval_general( @@ -1455,7 +1452,6 @@ def comparator(df1, df2): pandas_series, lambda ser: ser.dropna(), comparator=comparator, - raising_exceptions=(Exception,), ) eval_general( @@ -1463,7 +1459,6 @@ def comparator(df1, df2): pandas_series, lambda ser: ser.isna(), comparator=comparator, - raising_exceptions=(Exception,), ) eval_general( @@ -1471,7 +1466,6 @@ def comparator(df1, df2): pandas_series, lambda ser: ser.fillna(0), comparator=comparator, - raising_exceptions=(Exception,), ) From e1dbc69f0572a0db2fc41d37bd574ab786326672 Mon Sep 17 00:00:00 2001 From: Anatoly Myachev Date: Fri, 19 Apr 2024 12:50:02 +0200 Subject: [PATCH 03/50] fix Signed-off-by: Anatoly Myachev --- modin/core/dataframe/algebra/binary.py | 2 +- modin/tests/pandas/test_series.py | 12 ------------ 2 files changed, 1 insertion(+), 13 deletions(-) diff --git a/modin/core/dataframe/algebra/binary.py b/modin/core/dataframe/algebra/binary.py index 75474bb3f08..a5fc89dd573 100644 --- a/modin/core/dataframe/algebra/binary.py +++ b/modin/core/dataframe/algebra/binary.py @@ -250,7 +250,7 @@ def try_compute_new_dtypes( try: if infer_dtypes == "bool" or is_bool_dtype(result_dtype): - # can be `pandas.api.types.pandas_dtype("bool[pyarrow]")` depending on the data + # FIXME: can be `pandas.api.types.pandas_dtype("bool[pyarrow]")` depending on the data dtypes = maybe_build_dtypes_series( first, second, dtype=pandas.api.types.pandas_dtype(bool) ) diff --git a/modin/tests/pandas/test_series.py b/modin/tests/pandas/test_series.py index 36844772ada..8b4ec960de2 100644 --- a/modin/tests/pandas/test_series.py +++ b/modin/tests/pandas/test_series.py @@ -1409,18 +1409,6 @@ def test_pyarrow_constructor(): _ = pd.DataFrame(data, dtype=decimal_type) -def test_pyarrow_array_retrieve(): - pa = pytest.importorskip("pyarrow") - modin_series, pandas_series = create_test_series( - [1, 2, None], dtype="uint8[pyarrow]" - ) - eval_general( - modin_series, - pandas_series, - lambda ser: pa.array(ser), - ) - - def test_pyarrow_functions(): pytest.importorskip("pyarrow") modin_series, pandas_series = create_test_series( From 7b925a50c5f85ff8df8deae41b76295b56946beb Mon Sep 17 00:00:00 2001 From: Anatoly Myachev Date: Fri, 19 Apr 2024 13:15:33 +0200 Subject: [PATCH 04/50] cleanup Signed-off-by: Anatoly Myachev --- .../pandas/dataframe/test_map_metadata.py | 9 ++++++ modin/tests/pandas/test_series.py | 32 +++++++------------ 2 files changed, 20 insertions(+), 21 deletions(-) diff --git a/modin/tests/pandas/dataframe/test_map_metadata.py b/modin/tests/pandas/dataframe/test_map_metadata.py index b6dc1686ff8..ab7a7fa4a31 100644 --- a/modin/tests/pandas/dataframe/test_map_metadata.py +++ b/modin/tests/pandas/dataframe/test_map_metadata.py @@ -11,6 +11,8 @@ # 
ANY KIND, either express or implied. See the License for the specific language # governing permissions and limitations under the License. +from decimal import Decimal + import matplotlib import numpy as np import pandas @@ -1797,6 +1799,13 @@ def test_constructor(data): df_equals(pandas_df, modin_df) +def test_pyarrow_constructor(): + pa = pytest.importorskip("pyarrow") + + data = [[Decimal("3.19"), None], [None, Decimal("-1.23")]] + df_equals(*create_test_dfs(data, dtype=pd.ArrowDtype(pa.decimal128(3, scale=2)))) + + @pytest.mark.parametrize( "data", [ diff --git a/modin/tests/pandas/test_series.py b/modin/tests/pandas/test_series.py index 0d17823bf61..e5ffad9a7ee 100644 --- a/modin/tests/pandas/test_series.py +++ b/modin/tests/pandas/test_series.py @@ -1390,23 +1390,12 @@ def test_constructor_arrow_extension_array(): def test_pyarrow_constructor(): pa = pytest.importorskip("pyarrow") data = list("abcd") - _ = pd.Series(data, dtype="string[pyarrow]") - _ = pd.Series(data, dtype=pd.ArrowDtype(pa.string())) + df_equals(*create_test_series(data, dtype="string[pyarrow]")) + df_equals(*create_test_series(data, dtype=pd.ArrowDtype(pa.string()))) + data = [["hello"], ["there"]] list_str_type = pa.list_(pa.string()) - _ = pd.Series([["hello"], ["there"]], dtype=pd.ArrowDtype(list_str_type)) - - from datetime import time - - _ = pd.Index([time(12, 30), None], dtype=pd.ArrowDtype(pa.time64("us"))) - - from decimal import Decimal - - decimal_type = pd.ArrowDtype(pa.decimal128(3, scale=2)) - - data = [[Decimal("3.19"), None], [None, Decimal("-1.23")]] - - _ = pd.DataFrame(data, dtype=decimal_type) + df_equals(*create_test_series(data, dtype=pd.ArrowDtype(list_str_type))) def test_pyarrow_functions(): @@ -1428,12 +1417,13 @@ def comparator(df1, df2): comparator=comparator, ) - eval_general( - modin_series, - pandas_series, - lambda ser: ser > (ser + 1), - comparator=comparator, - ) + # FIXME: https://github.com/modin-project/modin/issues/7203 + # eval_general( + # modin_series, + # pandas_series, + # lambda ser: ser > (ser + 1), + # comparator=comparator, + # ) eval_general( modin_series, From 23003c580487971ae5ec82d8b9a7bfb58825f76b Mon Sep 17 00:00:00 2001 From: Anatoly Myachev Date: Fri, 19 Apr 2024 13:18:01 +0200 Subject: [PATCH 05/50] fix comment Signed-off-by: Anatoly Myachev --- modin/core/dataframe/algebra/binary.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/modin/core/dataframe/algebra/binary.py b/modin/core/dataframe/algebra/binary.py index a5fc89dd573..6af31ab826c 100644 --- a/modin/core/dataframe/algebra/binary.py +++ b/modin/core/dataframe/algebra/binary.py @@ -250,7 +250,8 @@ def try_compute_new_dtypes( try: if infer_dtypes == "bool" or is_bool_dtype(result_dtype): - # FIXME: can be `pandas.api.types.pandas_dtype("bool[pyarrow]")` depending on the data + # FIXME: https://github.com/modin-project/modin/issues/7203 + # can be `pandas.api.types.pandas_dtype("bool[pyarrow]")` depending on the data dtypes = maybe_build_dtypes_series( first, second, dtype=pandas.api.types.pandas_dtype(bool) ) From cc2a5ab8a4c14a2bd736cc390b4dc15b1003e328 Mon Sep 17 00:00:00 2001 From: Anatoly Myachev Date: Fri, 19 Apr 2024 14:06:55 +0200 Subject: [PATCH 06/50] skip some cases for HDK Signed-off-by: Anatoly Myachev --- modin/tests/pandas/test_series.py | 30 +++++++++++++++++------------- 1 file changed, 17 insertions(+), 13 deletions(-) diff --git a/modin/tests/pandas/test_series.py b/modin/tests/pandas/test_series.py index e5ffad9a7ee..fe3e3bc2e33 100644 --- 
a/modin/tests/pandas/test_series.py +++ b/modin/tests/pandas/test_series.py @@ -1409,13 +1409,15 @@ def comparator(df1, df2): df_equals(df1, df2) df_equals(df1.dtypes, df2.dtypes) - eval_general( - modin_series, - pandas_series, - lambda ser: ser - + (modin_series if isinstance(ser, pd.Series) else pandas_series), - comparator=comparator, - ) + if StorageFormat.get() != "Hdk": + # FIXME: HDK should also work in this case + eval_general( + modin_series, + pandas_series, + lambda ser: ser + + (modin_series if isinstance(ser, pd.Series) else pandas_series), + comparator=comparator, + ) # FIXME: https://github.com/modin-project/modin/issues/7203 # eval_general( @@ -1439,12 +1441,14 @@ def comparator(df1, df2): comparator=comparator, ) - eval_general( - modin_series, - pandas_series, - lambda ser: ser.fillna(0), - comparator=comparator, - ) + if StorageFormat.get() != "Hdk": + # FIXME: HDK should also work in this case + eval_general( + modin_series, + pandas_series, + lambda ser: ser.fillna(0), + comparator=comparator, + ) def test_pyarrow_array_retrieve(): From b710865e9dbd09fdbb30772cf76110e2e00467df Mon Sep 17 00:00:00 2001 From: Anatoly Myachev Date: Fri, 19 Apr 2024 14:42:14 +0200 Subject: [PATCH 07/50] FEAT-#7203: Make sure modin works correctly with pandas, which uses pyarrow as a backend Signed-off-by: Anatoly Myachev --- .../hdk_on_native/dataframe/utils.py | 6 ++---- modin/tests/pandas/test_series.py | 16 +++++++--------- 2 files changed, 9 insertions(+), 13 deletions(-) diff --git a/modin/experimental/core/execution/native/implementations/hdk_on_native/dataframe/utils.py b/modin/experimental/core/execution/native/implementations/hdk_on_native/dataframe/utils.py index f99cc256baa..4f749cf0e3b 100644 --- a/modin/experimental/core/execution/native/implementations/hdk_on_native/dataframe/utils.py +++ b/modin/experimental/core/execution/native/implementations/hdk_on_native/dataframe/utils.py @@ -23,7 +23,7 @@ import pandas import pyarrow as pa from pandas.core.arrays.arrow.extension_types import ArrowIntervalType -from pandas.core.dtypes.common import _get_dtype, is_string_dtype +from pandas.core.dtypes.common import _get_dtype from pyarrow.types import is_dictionary from modin.pandas.indexing import is_range_like @@ -504,9 +504,7 @@ def to_arrow_type(dtype) -> pa.lib.DataType: ------- pa.lib.DataType """ - if is_string_dtype(dtype): - return pa.from_numpy_dtype(str) - return pa.from_numpy_dtype(dtype) + return pandas.api.types.pandas_dtype(dtype).pyarrow_dtype def get_common_arrow_type(t1: pa.lib.DataType, t2: pa.lib.DataType) -> pa.lib.DataType: diff --git a/modin/tests/pandas/test_series.py b/modin/tests/pandas/test_series.py index fe3e3bc2e33..89722a62e6e 100644 --- a/modin/tests/pandas/test_series.py +++ b/modin/tests/pandas/test_series.py @@ -1409,15 +1409,13 @@ def comparator(df1, df2): df_equals(df1, df2) df_equals(df1.dtypes, df2.dtypes) - if StorageFormat.get() != "Hdk": - # FIXME: HDK should also work in this case - eval_general( - modin_series, - pandas_series, - lambda ser: ser - + (modin_series if isinstance(ser, pd.Series) else pandas_series), - comparator=comparator, - ) + eval_general( + modin_series, + pandas_series, + lambda ser: ser + + (modin_series if isinstance(ser, pd.Series) else pandas_series), + comparator=comparator, + ) # FIXME: https://github.com/modin-project/modin/issues/7203 # eval_general( From 310f12a7c6a45c4d02b8388df3990d03ccb3f015 Mon Sep 17 00:00:00 2001 From: Anatoly Myachev Date: Fri, 19 Apr 2024 15:22:24 +0200 Subject: [PATCH 08/50] don't use 
numpy types directly Signed-off-by: Anatoly Myachev --- modin/core/dataframe/base/dataframe/utils.py | 3 +- .../dataframe/pandas/dataframe/dataframe.py | 8 +- .../storage_formats/pandas/query_compiler.py | 122 +++++++++--------- .../hdk_on_native/calcite_serializer.py | 1 + modin/numpy/indexing.py | 4 +- modin/pandas/base.py | 12 +- modin/pandas/indexing.py | 4 +- 7 files changed, 76 insertions(+), 78 deletions(-) diff --git a/modin/core/dataframe/base/dataframe/utils.py b/modin/core/dataframe/base/dataframe/utils.py index adc159a1a0f..c8e3f742193 100644 --- a/modin/core/dataframe/base/dataframe/utils.py +++ b/modin/core/dataframe/base/dataframe/utils.py @@ -21,7 +21,6 @@ from enum import Enum from typing import Dict, List, Sequence, Tuple, cast -import numpy as np import pandas from pandas._typing import IndexLabel from pandas.api.types import is_scalar @@ -170,7 +169,7 @@ def is_trivial_index(index: pandas.Index) -> bool: return True if isinstance(index, pandas.RangeIndex): return index.start == 0 and index.step == 1 - if not (isinstance(index, pandas.Index) and index.dtype == np.int64): + if not (isinstance(index, pandas.Index) and index.dtype == "int64"): return False return ( index.is_monotonic_increasing diff --git a/modin/core/dataframe/pandas/dataframe/dataframe.py b/modin/core/dataframe/pandas/dataframe/dataframe.py index 6539e4d286f..bf13942b594 100644 --- a/modin/core/dataframe/pandas/dataframe/dataframe.py +++ b/modin/core/dataframe/pandas/dataframe/dataframe.py @@ -1175,7 +1175,7 @@ def _take_2d_positional( + f"received: {type(indexer)}", ) if isinstance(indexer, list): - indexer = np.array(indexer, dtype=np.int64) + indexer = np.array(indexer, dtype="int64") indexers.append(indexer) row_positions, col_positions = indexers @@ -1836,13 +1836,13 @@ def _get_dict_of_block_index(self, axis, indices, are_indices_sorted=False): return dict_of_slices if isinstance(indices, list): # Converting python list to numpy for faster processing - indices = np.array(indices, dtype=np.int64) + indices = np.array(indices, dtype="int64") # Fasttrack empty numpy array if isinstance(indices, np.ndarray) and indices.size == 0: # This will help preserve metadata stored in empty dataframes (indexes and dtypes) # Otherwise, we will get an empty `new_partitions` array, from which it will # no longer be possible to obtain metadata - return dict([(0, np.array([], dtype=np.int64))]) + return dict([(0, np.array([], dtype="int64"))]) negative_mask = np.less(indices, 0) has_negative = np.any(negative_mask) if has_negative: @@ -1850,7 +1850,7 @@ def _get_dict_of_block_index(self, axis, indices, are_indices_sorted=False): indices = ( indices.copy() if isinstance(indices, np.ndarray) - else np.array(indices, dtype=np.int64) + else np.array(indices, dtype="int64") ) indices[negative_mask] = indices[negative_mask] % len(self.get_axis(axis)) # If the `indices` array was modified because of the negative indices conversion diff --git a/modin/core/storage_formats/pandas/query_compiler.py b/modin/core/storage_formats/pandas/query_compiler.py index 7fd07d5fe91..ecdd0704129 100644 --- a/modin/core/storage_formats/pandas/query_compiler.py +++ b/modin/core/storage_formats/pandas/query_compiler.py @@ -1840,7 +1840,7 @@ def isin_func(df, values): ) return res - return Map.register(isin_func, shape_hint=shape_hint, dtypes=np.bool_)( + return Map.register(isin_func, shape_hint=shape_hint, dtypes="bool")( self, values ) @@ -1849,7 +1849,7 @@ def isin_func(df, values): conj = Map.register(lambda df, *args, **kwargs: 
pandas.DataFrame(np.conj(df))) convert_dtypes = Fold.register(pandas.DataFrame.convert_dtypes) invert = Map.register(pandas.DataFrame.__invert__, dtypes="copy") - isna = Map.register(pandas.DataFrame.isna, dtypes=np.bool_) + isna = Map.register(pandas.DataFrame.isna, dtypes="bool") _isfinite = Map.register( lambda df, *args, **kwargs: pandas.DataFrame(np.isfinite(df, *args, **kwargs)), dtypes=np.bool_, @@ -1889,7 +1889,7 @@ def isin_func(df, values): lambda df, *args, **kwargs: pandas.DataFrame(np.exp(df, *args, **kwargs)) ) # Needed for numpy API negative = Map.register(pandas.DataFrame.__neg__) - notna = Map.register(pandas.DataFrame.notna, dtypes=np.bool_) + notna = Map.register(pandas.DataFrame.notna, dtypes="bool") round = Map.register(pandas.DataFrame.round) replace = Map.register(pandas.DataFrame.replace) series_view = Map.register( @@ -1915,24 +1915,24 @@ def isin_func(df, values): str_capitalize = Map.register(_str_map("capitalize"), dtypes="copy") str_center = Map.register(_str_map("center"), dtypes="copy") - str_contains = Map.register(_str_map("contains"), dtypes=np.bool_) - str_count = Map.register(_str_map("count"), dtypes=int) - str_endswith = Map.register(_str_map("endswith"), dtypes=np.bool_) - str_find = Map.register(_str_map("find"), dtypes=np.int64) + str_contains = Map.register(_str_map("contains"), dtypes="bool") + str_count = Map.register(_str_map("count"), dtypes="int64") + str_endswith = Map.register(_str_map("endswith"), dtypes="bool") + str_find = Map.register(_str_map("find"), dtypes="int64") str_findall = Map.register(_str_map("findall"), dtypes="copy") str_get = Map.register(_str_map("get"), dtypes="copy") - str_index = Map.register(_str_map("index"), dtypes=np.int64) - str_isalnum = Map.register(_str_map("isalnum"), dtypes=np.bool_) - str_isalpha = Map.register(_str_map("isalpha"), dtypes=np.bool_) - str_isdecimal = Map.register(_str_map("isdecimal"), dtypes=np.bool_) - str_isdigit = Map.register(_str_map("isdigit"), dtypes=np.bool_) - str_islower = Map.register(_str_map("islower"), dtypes=np.bool_) - str_isnumeric = Map.register(_str_map("isnumeric"), dtypes=np.bool_) - str_isspace = Map.register(_str_map("isspace"), dtypes=np.bool_) - str_istitle = Map.register(_str_map("istitle"), dtypes=np.bool_) - str_isupper = Map.register(_str_map("isupper"), dtypes=np.bool_) + str_index = Map.register(_str_map("index"), dtypes="int64") + str_isalnum = Map.register(_str_map("isalnum"), dtypes="bool") + str_isalpha = Map.register(_str_map("isalpha"), dtypes="bool") + str_isdecimal = Map.register(_str_map("isdecimal"), dtypes="bool") + str_isdigit = Map.register(_str_map("isdigit"), dtypes="bool") + str_islower = Map.register(_str_map("islower"), dtypes="bool") + str_isnumeric = Map.register(_str_map("isnumeric"), dtypes="bool") + str_isspace = Map.register(_str_map("isspace"), dtypes="bool") + str_istitle = Map.register(_str_map("istitle"), dtypes="bool") + str_isupper = Map.register(_str_map("isupper"), dtypes="bool") str_join = Map.register(_str_map("join"), dtypes="copy") - str_len = Map.register(_str_map("len"), dtypes=int) + str_len = Map.register(_str_map("len"), dtypes="int64") str_ljust = Map.register(_str_map("ljust"), dtypes="copy") str_lower = Map.register(_str_map("lower"), dtypes="copy") str_lstrip = Map.register(_str_map("lstrip"), dtypes="copy") @@ -1961,8 +1961,8 @@ def str_extract(self, pat, flags, expand): return qc str_replace = Map.register(_str_map("replace"), dtypes="copy", shape_hint="column") - str_rfind = Map.register(_str_map("rfind"), 
dtypes=np.int64, shape_hint="column") - str_rindex = Map.register(_str_map("rindex"), dtypes=np.int64, shape_hint="column") + str_rfind = Map.register(_str_map("rfind"), dtypes="int64", shape_hint="column") + str_rindex = Map.register(_str_map("rindex"), dtypes="int64", shape_hint="column") str_rjust = Map.register(_str_map("rjust"), dtypes="copy", shape_hint="column") _str_rpartition = Map.register( _str_map("rpartition"), dtypes="copy", shape_hint="column" @@ -1996,7 +1996,7 @@ def str_split(self, pat=None, n=-1, expand=False, regex=None): return self._str_split(pat=pat, n=n, expand=False, regex=regex) str_startswith = Map.register( - _str_map("startswith"), dtypes=np.bool_, shape_hint="column" + _str_map("startswith"), dtypes="bool", shape_hint="column" ) str_strip = Map.register(_str_map("strip"), dtypes="copy", shape_hint="column") str_swapcase = Map.register( @@ -2068,51 +2068,49 @@ def searchsorted(df): # Dt map partitions operations - dt_date = Map.register(_dt_prop_map("date"), dtypes=np.object_) - dt_time = Map.register(_dt_prop_map("time"), dtypes=np.object_) - dt_timetz = Map.register(_dt_prop_map("timetz"), dtypes=np.object_) - dt_year = Map.register(_dt_prop_map("year"), dtypes=np.int32) - dt_month = Map.register(_dt_prop_map("month"), dtypes=np.int32) - dt_day = Map.register(_dt_prop_map("day"), dtypes=np.int32) - dt_hour = Map.register(_dt_prop_map("hour"), dtypes=np.int64) - dt_minute = Map.register(_dt_prop_map("minute"), dtypes=np.int64) - dt_second = Map.register(_dt_prop_map("second"), dtypes=np.int64) - dt_microsecond = Map.register(_dt_prop_map("microsecond"), dtypes=np.int64) - dt_nanosecond = Map.register(_dt_prop_map("nanosecond"), dtypes=np.int64) - dt_dayofweek = Map.register(_dt_prop_map("dayofweek"), dtypes=np.int64) - dt_weekday = Map.register(_dt_prop_map("weekday"), dtypes=np.int64) - dt_dayofyear = Map.register(_dt_prop_map("dayofyear"), dtypes=np.int64) - dt_quarter = Map.register(_dt_prop_map("quarter"), dtypes=np.int64) - dt_is_month_start = Map.register(_dt_prop_map("is_month_start"), dtypes=np.bool_) - dt_is_month_end = Map.register(_dt_prop_map("is_month_end"), dtypes=np.bool_) - dt_is_quarter_start = Map.register( - _dt_prop_map("is_quarter_start"), dtypes=np.bool_ - ) - dt_is_quarter_end = Map.register(_dt_prop_map("is_quarter_end"), dtypes=np.bool_) - dt_is_year_start = Map.register(_dt_prop_map("is_year_start"), dtypes=np.bool_) - dt_is_year_end = Map.register(_dt_prop_map("is_year_end"), dtypes=np.bool_) - dt_is_leap_year = Map.register(_dt_prop_map("is_leap_year"), dtypes=np.bool_) - dt_daysinmonth = Map.register(_dt_prop_map("daysinmonth"), dtypes=np.int64) - dt_days_in_month = Map.register(_dt_prop_map("days_in_month"), dtypes=np.int64) + dt_date = Map.register(_dt_prop_map("date"), dtypes="object") + dt_time = Map.register(_dt_prop_map("time"), dtypes="object") + dt_timetz = Map.register(_dt_prop_map("timetz"), dtypes="object") + dt_year = Map.register(_dt_prop_map("year"), dtypes="int32") + dt_month = Map.register(_dt_prop_map("month"), dtypes="int32") + dt_day = Map.register(_dt_prop_map("day"), dtypes="int32") + dt_hour = Map.register(_dt_prop_map("hour"), dtypes="int64") + dt_minute = Map.register(_dt_prop_map("minute"), dtypes="int64") + dt_second = Map.register(_dt_prop_map("second"), dtypes="int64") + dt_microsecond = Map.register(_dt_prop_map("microsecond"), dtypes="int64") + dt_nanosecond = Map.register(_dt_prop_map("nanosecond"), dtypes="int64") + dt_dayofweek = Map.register(_dt_prop_map("dayofweek"), dtypes="int64") + dt_weekday = 
Map.register(_dt_prop_map("weekday"), dtypes="int64") + dt_dayofyear = Map.register(_dt_prop_map("dayofyear"), dtypes="int64") + dt_quarter = Map.register(_dt_prop_map("quarter"), dtypes="int64") + dt_is_month_start = Map.register(_dt_prop_map("is_month_start"), dtypes="bool") + dt_is_month_end = Map.register(_dt_prop_map("is_month_end"), dtypes="bool") + dt_is_quarter_start = Map.register(_dt_prop_map("is_quarter_start"), dtypes="bool") + dt_is_quarter_end = Map.register(_dt_prop_map("is_quarter_end"), dtypes="bool") + dt_is_year_start = Map.register(_dt_prop_map("is_year_start"), dtypes="bool") + dt_is_year_end = Map.register(_dt_prop_map("is_year_end"), dtypes="bool") + dt_is_leap_year = Map.register(_dt_prop_map("is_leap_year"), dtypes="bool") + dt_daysinmonth = Map.register(_dt_prop_map("daysinmonth"), dtypes="int64") + dt_days_in_month = Map.register(_dt_prop_map("days_in_month"), dtypes="int64") dt_asfreq = Map.register(_dt_func_map("asfreq")) dt_to_period = Map.register(_dt_func_map("to_period")) - dt_to_pydatetime = Map.register(_dt_func_map("to_pydatetime"), dtypes=np.object_) + dt_to_pydatetime = Map.register(_dt_func_map("to_pydatetime"), dtypes="object") dt_tz_localize = Map.register(_dt_func_map("tz_localize")) dt_tz_convert = Map.register(_dt_func_map("tz_convert")) dt_normalize = Map.register(_dt_func_map("normalize")) - dt_strftime = Map.register(_dt_func_map("strftime"), dtypes=np.object_) + dt_strftime = Map.register(_dt_func_map("strftime"), dtypes="object") dt_round = Map.register(_dt_func_map("round")) dt_floor = Map.register(_dt_func_map("floor")) dt_ceil = Map.register(_dt_func_map("ceil")) - dt_month_name = Map.register(_dt_func_map("month_name"), dtypes=np.object_) - dt_day_name = Map.register(_dt_func_map("day_name"), dtypes=np.object_) - dt_to_pytimedelta = Map.register(_dt_func_map("to_pytimedelta"), dtypes=np.object_) - dt_total_seconds = Map.register(_dt_func_map("total_seconds"), dtypes=np.float64) - dt_seconds = Map.register(_dt_prop_map("seconds"), dtypes=np.int64) - dt_days = Map.register(_dt_prop_map("days"), dtypes=np.int64) - dt_microseconds = Map.register(_dt_prop_map("microseconds"), dtypes=np.int64) - dt_nanoseconds = Map.register(_dt_prop_map("nanoseconds"), dtypes=np.int64) - dt_qyear = Map.register(_dt_prop_map("qyear"), dtypes=np.int64) + dt_month_name = Map.register(_dt_func_map("month_name"), dtypes="object") + dt_day_name = Map.register(_dt_func_map("day_name"), dtypes="object") + dt_to_pytimedelta = Map.register(_dt_func_map("to_pytimedelta"), dtypes="object") + dt_total_seconds = Map.register(_dt_func_map("total_seconds"), dtypes="float64") + dt_seconds = Map.register(_dt_prop_map("seconds"), dtypes="int64") + dt_days = Map.register(_dt_prop_map("days"), dtypes="int64") + dt_microseconds = Map.register(_dt_prop_map("microseconds"), dtypes="int64") + dt_nanoseconds = Map.register(_dt_prop_map("nanoseconds"), dtypes="int64") + dt_qyear = Map.register(_dt_prop_map("qyear"), dtypes="int64") dt_start_time = Map.register(_dt_prop_map("start_time")) dt_end_time = Map.register(_dt_prop_map("end_time")) dt_to_timestamp = Map.register(_dt_func_map("to_timestamp")) @@ -2284,7 +2282,7 @@ def map_func(df): # pragma: no cover n_rows = df.shape[0] df_mask = np.isfinite(df) - result = np.empty((n_rows, n_cols), dtype=np.float64) + result = np.empty((n_rows, n_cols), dtype="float64") for i in range(n_rows): df_ith_row = df[i] @@ -2636,7 +2634,7 @@ def quantile_builder(df, **kwargs): lambda df: quantile_builder(df, **kwargs), new_index=q_index, 
new_columns=new_columns, - dtypes=np.float64, + dtypes="float64", ) result = self.__constructor__(new_modin_frame) return result.transpose() if axis == 1 else result @@ -2653,7 +2651,7 @@ def rank(self, **kwargs): if not numeric_only else None ), - dtypes=np.float64, + dtypes="float64", sync_labels=False, ) return self.__constructor__(new_modin_frame) @@ -3163,7 +3161,7 @@ def _compute_duplicated(df): # pragma: no cover func=_compute_duplicated, new_index=self._modin_frame.copy_index_cache(), new_columns=[MODIN_UNNAMED_SERIES_LABEL], - dtypes=np.bool_, + dtypes="bool", keep_partitioning=True, ) return self.__constructor__(new_modin_frame, shape_hint="column") diff --git a/modin/experimental/core/execution/native/implementations/hdk_on_native/calcite_serializer.py b/modin/experimental/core/execution/native/implementations/hdk_on_native/calcite_serializer.py index 7099751dafe..b00e73dc745 100644 --- a/modin/experimental/core/execution/native/implementations/hdk_on_native/calcite_serializer.py +++ b/modin/experimental/core/execution/native/implementations/hdk_on_native/calcite_serializer.py @@ -67,6 +67,7 @@ class CalciteSerializer: "datetime64": "TIMESTAMP", } + # TODO: Is it necessary to use more general types here (not dependent on NumPy)? _INT_OPTS = { np.int8: ("TINYINT", 3), np.int16: ("SMALLINT", 5), diff --git a/modin/numpy/indexing.py b/modin/numpy/indexing.py index b598577a34d..4223ae3e513 100644 --- a/modin/numpy/indexing.py +++ b/modin/numpy/indexing.py @@ -214,7 +214,7 @@ def boolean_mask_to_numeric(indexer): # `itertools.compress` masks `data` with the `selectors` mask, # works about ~10% faster than a pure list comprehension itertools.compress(data=range(len(indexer)), selectors=indexer), - dtype=np.int64, + dtype="int64", ) @@ -585,7 +585,7 @@ def _compute_lookup(self, row_loc, col_loc): # `Index.__getitem__` works much faster with numpy arrays than with python lists, # so although we lose some time here on converting to numpy, `Index.__getitem__` # speedup covers the loss that we gain here. - axis_loc = np.array(axis_loc, dtype=np.int64) + axis_loc = np.array(axis_loc, dtype="int64") # Relatively fast check allows us to not trigger `self.qc.get_axis()` computation # if there're no negative indices and so they don't not depend on the axis length. if isinstance(axis_loc, np.ndarray) and not (axis_loc < 0).any(): diff --git a/modin/pandas/base.py b/modin/pandas/base.py index 282d27eaf67..81dc3b03a46 100644 --- a/modin/pandas/base.py +++ b/modin/pandas/base.py @@ -1530,7 +1530,7 @@ def eq(self, other, axis="columns", level=None): # noqa: PR01, RT01, D200 """ Get equality of `BasePandasDataset` and `other`, element-wise (binary operator `eq`). """ - return self._binary_op("eq", other, axis=axis, level=level, dtypes=np.bool_) + return self._binary_op("eq", other, axis=axis, level=level, dtypes="bool") def explode(self, column, ignore_index: bool = False): # noqa: PR01, RT01, D200 """ @@ -1831,7 +1831,7 @@ def ge(self, other, axis="columns", level=None): # noqa: PR01, RT01, D200 """ Get greater than or equal comparison of `BasePandasDataset` and `other`, element-wise (binary operator `ge`). 
""" - return self._binary_op("ge", other, axis=axis, level=level, dtypes=np.bool_) + return self._binary_op("ge", other, axis=axis, level=level, dtypes="bool") def get(self, key, default=None): # noqa: PR01, RT01, D200 """ @@ -1847,7 +1847,7 @@ def gt(self, other, axis="columns", level=None): # noqa: PR01, RT01, D200 """ Get greater than comparison of `BasePandasDataset` and `other`, element-wise (binary operator `gt`). """ - return self._binary_op("gt", other, axis=axis, level=level, dtypes=np.bool_) + return self._binary_op("gt", other, axis=axis, level=level, dtypes="bool") def head(self, n=5): # noqa: PR01, RT01, D200 """ @@ -1979,13 +1979,13 @@ def le(self, other, axis="columns", level=None): # noqa: PR01, RT01, D200 """ Get less than or equal comparison of `BasePandasDataset` and `other`, element-wise (binary operator `le`). """ - return self._binary_op("le", other, axis=axis, level=level, dtypes=np.bool_) + return self._binary_op("le", other, axis=axis, level=level, dtypes="bool") def lt(self, other, axis="columns", level=None): # noqa: PR01, RT01, D200 """ Get less than comparison of `BasePandasDataset` and `other`, element-wise (binary operator `lt`). """ - return self._binary_op("lt", other, axis=axis, level=level, dtypes=np.bool_) + return self._binary_op("lt", other, axis=axis, level=level, dtypes="bool") @property def loc(self): # noqa: RT01, D200 @@ -2194,7 +2194,7 @@ def ne(self, other, axis="columns", level=None): # noqa: PR01, RT01, D200 """ Get Not equal comparison of `BasePandasDataset` and `other`, element-wise (binary operator `ne`). """ - return self._binary_op("ne", other, axis=axis, level=level, dtypes=np.bool_) + return self._binary_op("ne", other, axis=axis, level=level, dtypes="bool") def notna(self): # noqa: RT01, D200 """ diff --git a/modin/pandas/indexing.py b/modin/pandas/indexing.py index d901b6dac99..316a75f82a7 100644 --- a/modin/pandas/indexing.py +++ b/modin/pandas/indexing.py @@ -232,7 +232,7 @@ def boolean_mask_to_numeric(indexer): # `itertools.compress` masks `data` with the `selectors` mask, # works about ~10% faster than a pure list comprehension itertools.compress(data=range(len(indexer)), selectors=indexer), - dtype=np.int64, + dtype="int64", ) @@ -1130,7 +1130,7 @@ def _compute_lookup(self, row_loc, col_loc): # `Index.__getitem__` works much faster with numpy arrays than with python lists, # so although we lose some time here on converting to numpy, `Index.__getitem__` # speedup covers the loss that we gain here. - axis_loc = np.array(axis_loc, dtype=np.int64) + axis_loc = np.array(axis_loc, dtype="int64") # Relatively fast check allows us to not trigger `self.qc.get_axis()` computation # if there're no negative indices and so they don't not depend on the axis length. 
if isinstance(axis_loc, np.ndarray) and not (axis_loc < 0).any(): From cb90479c826b465349fbf7c08181190dfa4715e4 Mon Sep 17 00:00:00 2001 From: Anatoly Myachev Date: Fri, 19 Apr 2024 16:39:46 +0200 Subject: [PATCH 09/50] try another dtype_backend Signed-off-by: Anatoly Myachev --- modin/tests/pandas/utils.py | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) diff --git a/modin/tests/pandas/utils.py b/modin/tests/pandas/utils.py index f374071cef8..f34ff0e7c94 100644 --- a/modin/tests/pandas/utils.py +++ b/modin/tests/pandas/utils.py @@ -1088,8 +1088,13 @@ def eval_io_from_str(csv_str: str, unique_filename: str, **kwargs): def create_test_dfs(*args, **kwargs) -> tuple[pd.DataFrame, pandas.DataFrame]: post_fn = kwargs.pop("post_fn", lambda df: df) + post_fn2 = lambda df: post_fn(df).convert_dtypes( + dtype_backend="dtype_backend" + ) # noqa: E731 return tuple( - map(post_fn, [pd.DataFrame(*args, **kwargs), pandas.DataFrame(*args, **kwargs)]) + map( + post_fn2, [pd.DataFrame(*args, **kwargs), pandas.DataFrame(*args, **kwargs)] + ) ) @@ -1103,6 +1108,9 @@ def create_test_series(vals, sort=False, **kwargs) -> tuple[pd.Series, pandas.Se if sort: modin_series = modin_series.sort_values().reset_index(drop=True) pandas_series = pandas_series.sort_values().reset_index(drop=True) + + modin_series = modin_series.convert_dtypes(dtype_backend="dtype_backend") + pandas_series = pandas_series.convert_dtypes(dtype_backend="dtype_backend") return modin_series, pandas_series From f9b25607b6267223f7f362b41549beb693860113 Mon Sep 17 00:00:00 2001 From: Anatoly Myachev Date: Fri, 19 Apr 2024 16:44:17 +0200 Subject: [PATCH 10/50] fix Signed-off-by: Anatoly Myachev --- modin/tests/pandas/utils.py | 16 +++++++--------- 1 file changed, 7 insertions(+), 9 deletions(-) diff --git a/modin/tests/pandas/utils.py b/modin/tests/pandas/utils.py index f34ff0e7c94..4a9be1dd58a 100644 --- a/modin/tests/pandas/utils.py +++ b/modin/tests/pandas/utils.py @@ -1087,14 +1087,12 @@ def eval_io_from_str(csv_str: str, unique_filename: str, **kwargs): def create_test_dfs(*args, **kwargs) -> tuple[pd.DataFrame, pandas.DataFrame]: - post_fn = kwargs.pop("post_fn", lambda df: df) - post_fn2 = lambda df: post_fn(df).convert_dtypes( - dtype_backend="dtype_backend" - ) # noqa: E731 + post_fn = kwargs.pop("post_fn", None) + + if post_fn is None: + post_fn = lambda df: df.convert_dtypes(dtype_backend="pyarrow") # noqa: E731 return tuple( - map( - post_fn2, [pd.DataFrame(*args, **kwargs), pandas.DataFrame(*args, **kwargs)] - ) + map(post_fn, [pd.DataFrame(*args, **kwargs), pandas.DataFrame(*args, **kwargs)]) ) @@ -1109,8 +1107,8 @@ def create_test_series(vals, sort=False, **kwargs) -> tuple[pd.Series, pandas.Se modin_series = modin_series.sort_values().reset_index(drop=True) pandas_series = pandas_series.sort_values().reset_index(drop=True) - modin_series = modin_series.convert_dtypes(dtype_backend="dtype_backend") - pandas_series = pandas_series.convert_dtypes(dtype_backend="dtype_backend") + modin_series = modin_series.convert_dtypes(dtype_backend="pyarrow") + pandas_series = pandas_series.convert_dtypes(dtype_backend="pyarrow") return modin_series, pandas_series From ddcda4ff88abbe94e3e2f60b575eb93269819431 Mon Sep 17 00:00:00 2001 From: Anatoly Myachev Date: Thu, 2 May 2024 16:50:41 +0200 Subject: [PATCH 11/50] fixes Signed-off-by: Anatoly Myachev --- modin/core/dataframe/algebra/binary.py | 17 +++++++--- .../dataframe/pandas/dataframe/dataframe.py | 9 ++++- .../core/dataframe/pandas/metadata/dtypes.py | 33 ++++++++++--------- 
modin/tests/pandas/dataframe/test_default.py | 2 +- modin/tests/pandas/utils.py | 2 ++ 5 files changed, 41 insertions(+), 22 deletions(-) diff --git a/modin/core/dataframe/algebra/binary.py b/modin/core/dataframe/algebra/binary.py index 6af31ab826c..5138d728aba 100644 --- a/modin/core/dataframe/algebra/binary.py +++ b/modin/core/dataframe/algebra/binary.py @@ -179,7 +179,7 @@ def maybe_build_dtypes_series( First operand for which the binary operation would be performed later. second : PandasQueryCompiler, list-like or scalar Second operand for which the binary operation would be performed later. - dtype : np.dtype + dtype : pandas supported dtype Dtype of the result. trigger_computations : bool, default: False Whether to trigger computation of the lazy metadata for `first` and `second`. @@ -250,10 +250,19 @@ def try_compute_new_dtypes( try: if infer_dtypes == "bool" or is_bool_dtype(result_dtype): - # FIXME: https://github.com/modin-project/modin/issues/7203 - # can be `pandas.api.types.pandas_dtype("bool[pyarrow]")` depending on the data + # dataframe can contain types of different backends at the same time, for example: + # (Pdb) (pandas.DataFrame([[1,2,3], [4,5,6]]).astype({0: "int64[pyarrow]"}) > 4).dtypes + # 0 bool[pyarrow] + # 1 bool + # 2 bool + # dtype: object + backend = "" + if any("pyarrow" in str(x) for x in first.dtypes) or any( + "pyarrow" in str(x) for x in second.dtypes + ): + backend = "pyarrow" dtypes = maybe_build_dtypes_series( - first, second, dtype=pandas.api.types.pandas_dtype(bool) + first, second, dtype=pandas.api.types.pandas_dtype(f"bool[{backend}]") ) elif infer_dtypes == "common_cast": dtypes = maybe_compute_dtypes_common_cast( diff --git a/modin/core/dataframe/pandas/dataframe/dataframe.py b/modin/core/dataframe/pandas/dataframe/dataframe.py index bf13942b594..c0989b246c4 100644 --- a/modin/core/dataframe/pandas/dataframe/dataframe.py +++ b/modin/core/dataframe/pandas/dataframe/dataframe.py @@ -1269,8 +1269,15 @@ def _take_2d_positional( new_dtypes = self.dtypes.iloc[monotonic_col_idx] elif isinstance(self._dtypes, ModinDtypes): try: + supported_monotonic_col_idx = monotonic_col_idx + if isinstance(monotonic_col_idx, slice): + supported_monotonic_col_idx = pandas.RangeIndex( + monotonic_col_idx.start, + monotonic_col_idx.stop, + monotonic_col_idx.step, + ).to_list() new_dtypes = self._dtypes.lazy_get( - monotonic_col_idx, numeric_index=True + supported_monotonic_col_idx, numeric_index=True ) # can raise either on missing cache or on duplicated labels except (ValueError, NotImplementedError): diff --git a/modin/core/dataframe/pandas/metadata/dtypes.py b/modin/core/dataframe/pandas/metadata/dtypes.py index b904c6fbff6..88f575d7288 100644 --- a/modin/core/dataframe/pandas/metadata/dtypes.py +++ b/modin/core/dataframe/pandas/metadata/dtypes.py @@ -496,21 +496,18 @@ def _merge_dtypes( # in the 'dtypes_matrix' series = pandas.Series(dtypes, name=i) dtypes_matrix = pandas.concat([dtypes_matrix, series], axis=1) - dtypes_matrix.fillna( - value={ - # If we encountered a 'NaN' while 'val' describes all the columns, then - # it means, that the missing columns for this instance will be filled with NaNs (floats), - # otherwise, it may indicate missing columns that this 'val' has no info about, - # meaning that we shouldn't try computing a new dtype for this column, - # so marking it as 'unknown' - i: ( - pandas.api.types.pandas_dtype(float) - if val._know_all_names and val._remaining_dtype is None - else "unknown" - ) - }, - inplace=True, - ) + if val._know_all_names and 
val._remaining_dtype is None: + dtypes_matrix.fillna( + value={ + # If we encountered a 'NaN' while 'val' describes all the columns, then + # it means, that the missing columns for this instance will be filled with NaNs (floats), + # otherwise, it may indicate missing columns that this 'val' has no info about, + # meaning that we shouldn't try computing a new dtype for this column, + # so marking it as 'unknown' + i: "unknown", + }, + inplace=True, + ) elif isinstance(val, pandas.Series): dtypes_matrix = pandas.concat([dtypes_matrix, val], axis=1) elif val is None: @@ -531,7 +528,11 @@ def _merge_dtypes( def combine_dtypes(row): if (row == "unknown").any(): return "unknown" - row = row.fillna(pandas.api.types.pandas_dtype("float")) + if any("pyarrow" in str(x) for x in row): + # nans can be stored not only in float types, for example in `bool[pyarrow]` + row = row[~row.isna()] + else: + row = row.fillna(pandas.api.types.pandas_dtype("float")) return find_common_type(list(row.values)) dtypes = dtypes_matrix.apply(combine_dtypes, axis=1) diff --git a/modin/tests/pandas/dataframe/test_default.py b/modin/tests/pandas/dataframe/test_default.py index 28c2c20a53e..40e14eb55c9 100644 --- a/modin/tests/pandas/dataframe/test_default.py +++ b/modin/tests/pandas/dataframe/test_default.py @@ -715,7 +715,7 @@ def test_pivot_table_data(data, index, columns, values, aggfunc, request): "callable_tree_reduce_func" in request.node.callspec.id and "int_data" in request.node.callspec.id ): - expected_exception = TypeError("'numpy.float64' object is not callable") + expected_exception = TypeError("'float' object is not callable") eval_general( md_df, diff --git a/modin/tests/pandas/utils.py b/modin/tests/pandas/utils.py index 4a9be1dd58a..b88a5d0a0b0 100644 --- a/modin/tests/pandas/utils.py +++ b/modin/tests/pandas/utils.py @@ -1090,6 +1090,7 @@ def create_test_dfs(*args, **kwargs) -> tuple[pd.DataFrame, pandas.DataFrame]: post_fn = kwargs.pop("post_fn", None) if post_fn is None: + # TODO: REVERT ME post_fn = lambda df: df.convert_dtypes(dtype_backend="pyarrow") # noqa: E731 return tuple( map(post_fn, [pd.DataFrame(*args, **kwargs), pandas.DataFrame(*args, **kwargs)]) @@ -1107,6 +1108,7 @@ def create_test_series(vals, sort=False, **kwargs) -> tuple[pd.Series, pandas.Se modin_series = modin_series.sort_values().reset_index(drop=True) pandas_series = pandas_series.sort_values().reset_index(drop=True) + # TODO: REVERT ME modin_series = modin_series.convert_dtypes(dtype_backend="pyarrow") pandas_series = pandas_series.convert_dtypes(dtype_backend="pyarrow") return modin_series, pandas_series From afae62fd4e2e239e15c7c656148249d20c5c723d Mon Sep 17 00:00:00 2001 From: Anatoly Myachev Date: Thu, 2 May 2024 17:52:40 +0200 Subject: [PATCH 12/50] fix Signed-off-by: Anatoly Myachev --- modin/core/dataframe/algebra/binary.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/modin/core/dataframe/algebra/binary.py b/modin/core/dataframe/algebra/binary.py index 5138d728aba..019d7280289 100644 --- a/modin/core/dataframe/algebra/binary.py +++ b/modin/core/dataframe/algebra/binary.py @@ -260,9 +260,9 @@ def try_compute_new_dtypes( if any("pyarrow" in str(x) for x in first.dtypes) or any( "pyarrow" in str(x) for x in second.dtypes ): - backend = "pyarrow" + backend = "[pyarrow]" dtypes = maybe_build_dtypes_series( - first, second, dtype=pandas.api.types.pandas_dtype(f"bool[{backend}]") + first, second, dtype=pandas.api.types.pandas_dtype(f"bool{backend}") ) elif infer_dtypes == "common_cast": dtypes = 
maybe_compute_dtypes_common_cast(

From 316cddb4edbaf1214f4f696868f9ccd1bf0bb0e0 Mon Sep 17 00:00:00 2001
From: Anatoly Myachev
Date: Thu, 2 May 2024 18:31:30 +0200
Subject: [PATCH 13/50] fix

Signed-off-by: Anatoly Myachev
---
 modin/pandas/base.py | 25 +++++++++++++------------
 1 file changed, 13 insertions(+), 12 deletions(-)

diff --git a/modin/pandas/base.py b/modin/pandas/base.py
index 81dc3b03a46..013f90031bc 100644
--- a/modin/pandas/base.py
+++ b/modin/pandas/base.py
@@ -351,7 +351,7 @@ def _validate_other(
                 if label in other
             ]
         else:
-            other_dtypes = [type(x) for x in other]
+            other_dtypes = [x if pandas.isna(x) else type(x) for x in other]
         if compare_index:
             if not self.index.equals(other.index):
                 raise TypeError("Cannot perform operation with non-equal index")
@@ -371,17 +371,18 @@ def _validate_other(
             # TODO(https://github.com/modin-project/modin/issues/5239):
             # this spuriously rejects other that is a list including some
             # custom type that can be added to self's elements.
-            if not all(
-                (is_numeric_dtype(self_dtype) and is_numeric_dtype(other_dtype))
-                or (is_object_dtype(self_dtype) and is_object_dtype(other_dtype))
-                or (
-                    lib.is_np_dtype(self_dtype, "mM")
-                    and lib.is_np_dtype(self_dtype, "mM")
-                )
-                or is_dtype_equal(self_dtype, other_dtype)
-                for self_dtype, other_dtype in zip(self_dtypes, other_dtypes)
-            ):
-                raise TypeError("Cannot do operation with improper dtypes")
+            for self_dtype, other_dtype in zip(self_dtypes, other_dtypes):
+                if not (
+                    (is_numeric_dtype(self_dtype) and is_numeric_dtype(other_dtype))
+                    or (is_numeric_dtype(self_dtype) and pandas.isna(other_dtype))
+                    or (is_object_dtype(self_dtype) and is_object_dtype(other_dtype))
+                    or (
+                        lib.is_np_dtype(self_dtype, "mM")
+                        and lib.is_np_dtype(other_dtype, "mM")
+                    )
+                    or is_dtype_equal(self_dtype, other_dtype)
+                ):
+                    raise TypeError("Cannot do operation with improper dtypes")
         return result

     def _validate_function(self, func, on_invalid=None):

From 639c2edbb3e4baf34d527ee5e5e163b4f41547ef Mon Sep 17 00:00:00 2001
From: Anatoly Myachev
Date: Thu, 2 May 2024 20:39:35 +0200
Subject: [PATCH 14/50] fix pivot_table

Signed-off-by: Anatoly Myachev
---
 modin/core/storage_formats/pandas/groupby.py | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/modin/core/storage_formats/pandas/groupby.py b/modin/core/storage_formats/pandas/groupby.py
index e327efbda4a..4b22b5c0158 100644
--- a/modin/core/storage_formats/pandas/groupby.py
+++ b/modin/core/storage_formats/pandas/groupby.py
@@ -15,6 +15,7 @@

 import numpy as np
 import pandas
+from pandas.core.dtypes.cast import find_common_type

 from modin.config import use_range_partitioning_groupby
 from modin.core.dataframe.algebra import GroupByReduce
@@ -358,7 +359,9 @@ def applyier(df, other):  # pragma: no cover
             # transposing it back to be consistent with column axis values along
             # different partitions
             if len(index) == 0 and len(columns) > 0:
-                result = result.transpose()
+                common_type = find_common_type(result.dtypes.tolist())
+                # transpose loses dtypes: https://github.com/pandas-dev/pandas/issues/43337
+                result = result.transpose().astype(common_type, copy=False)

         return result

From 05f32e51d683c99c5b39e2920fe8630c993a9da6 Mon Sep 17 00:00:00 2001
From: Anatoly Myachev
Date: Thu, 2 May 2024 22:32:08 +0200
Subject: [PATCH 15/50] fix

Signed-off-by: Anatoly Myachev
---
 modin/tests/pandas/dataframe/test_default.py | 1 +
 modin/tests/pandas/utils.py                  | 5 +++--
 2 files changed, 4 insertions(+), 2 deletions(-)

diff --git a/modin/tests/pandas/dataframe/test_default.py 
b/modin/tests/pandas/dataframe/test_default.py
index 40e14eb55c9..0f3ca39fd72 100644
--- a/modin/tests/pandas/dataframe/test_default.py
+++ b/modin/tests/pandas/dataframe/test_default.py
@@ -634,6 +634,7 @@ def test_pivot(data, index, columns, values, request):
         expected_exception = ValueError(
             "Index contains duplicate entries, cannot reshape"
         )
+    # fails because pandas doesn't preserve the dtype backend
    eval_general(
         *create_test_dfs(data),
         lambda df, *args, **kwargs: df.pivot(*args, **kwargs),
diff --git a/modin/tests/pandas/utils.py b/modin/tests/pandas/utils.py
index b88a5d0a0b0..b04026393a9 100644
--- a/modin/tests/pandas/utils.py
+++ b/modin/tests/pandas/utils.py
@@ -662,9 +662,10 @@ def assert_dtypes_equal(df1, df2):
         lambda obj: isinstance(obj, pandas.PeriodDtype),
     )

-    for col in dtypes1.keys():
+    # `test_pivot_table_margins` failed due to the use of ``pd.NA`` in a column name
+    for idx in range(len(dtypes1)):
         for comparator in dtype_comparators:
-            if assert_all_act_same(comparator, dtypes1[col], dtypes2[col]):
+            if assert_all_act_same(comparator, dtypes1.iloc[idx], dtypes2.iloc[idx]):
                 # We met a dtype that both types satisfy, so we can stop iterating
                 # over comparators and compare next dtypes
                 break

From 194cc68c553c98d8acac5edd44c5905729ffaad2 Mon Sep 17 00:00:00 2001
From: Anatoly Myachev
Date: Fri, 3 May 2024 16:27:27 +0200
Subject: [PATCH 16/50] find potential problem areas at the query_compiler level

Signed-off-by: Anatoly Myachev
---
 modin/core/dataframe/algebra/binary.py        | 26 ++++--
 modin/core/dataframe/algebra/map.py           |  9 +-
 modin/core/dataframe/algebra/tree_reduce.py   |  2 +-
 .../dataframe/pandas/dataframe/dataframe.py   | 28 +++---
 .../core/dataframe/pandas/metadata/dtypes.py  | 29 ++++---
 modin/core/dataframe/pandas/metadata/index.py |  5 +-
 .../storage_formats/base/query_compiler.py    |  7 +-
 .../storage_formats/pandas/query_compiler.py  | 85 ++++++++++++-------
 modin/pandas/utils.py                         |  1 +
 9 files changed, 126 insertions(+), 66 deletions(-)

diff --git a/modin/core/dataframe/algebra/binary.py b/modin/core/dataframe/algebra/binary.py
index 019d7280289..acc84460b0e 100644
--- a/modin/core/dataframe/algebra/binary.py
+++ b/modin/core/dataframe/algebra/binary.py
@@ -13,8 +13,10 @@

 """Module houses builder class for Binary operator."""

+from __future__ import annotations
+
 import warnings
-from typing import Optional
+from typing import TYPE_CHECKING, Optional

 import numpy as np
 import pandas
@@ -24,9 +26,12 @@

 from .operator import Operator

+if TYPE_CHECKING:
+    from modin.core.storage_formats.pandas.query_compiler import PandasQueryCompiler
+

 def maybe_compute_dtypes_common_cast(
-    first,
+    first: PandasQueryCompiler,
     second,
     trigger_computations=False,
     axis=0,
@@ -80,6 +85,7 @@
         # belong to the intersection, these will be NaN columns in the result
         mismatch_columns = columns_first ^ columns_second
     elif isinstance(second, dict):
+        # TODO: pyarrow backend
        dtypes_second = {
             key: pandas.api.types.pandas_dtype(type(value))
             for key, value in second.items()
@@ -92,6 +98,7 @@
         mismatch_columns = columns_first.difference(columns_second)
     else:
         if isinstance(second, (list, tuple)):
+            # TODO: pyarrow backend
            second_dtypes_list = (
                 [pandas.api.types.pandas_dtype(type(value)) for value in second]
                 if axis == 1
@@ -100,6 +107,7 @@
                 else [np.array(second).dtype] * len(dtypes_first)
             )
         elif is_scalar(second) or isinstance(second, np.ndarray):
+            # TODO: pyarrow backend
            try:
                 dtype = getattr(second, "dtype", 
None) or pandas.api.types.pandas_dtype( type(second) @@ -125,6 +133,7 @@ def maybe_compute_dtypes_common_cast( mismatch_columns = [] # If at least one column doesn't match, the result of the non matching column would be nan. + # TODO: pyarrow backend nan_dtype = pandas.api.types.pandas_dtype(type(np.nan)) dtypes = None if func is not None: @@ -168,7 +177,7 @@ def maybe_compute_dtypes_common_cast( def maybe_build_dtypes_series( - first, second, dtype, trigger_computations=False + first: PandasQueryCompiler, second, dtype, trigger_computations=False ) -> Optional[pandas.Series]: """ Build a ``pandas.Series`` describing dtypes of the result of a binary operation. @@ -217,8 +226,13 @@ def maybe_build_dtypes_series( def try_compute_new_dtypes( - first, second, infer_dtypes=None, result_dtype=None, axis=0, func=None -): + first: PandasQueryCompiler, + second, + infer_dtypes=None, + result_dtype=None, + axis=0, + func=None, +) -> Optional[pandas.Series]: """ Precompute resulting dtypes of the binary operation if possible. @@ -235,7 +249,7 @@ def try_compute_new_dtypes( infer_dtypes : {"common_cast", "try_sample", "bool", None}, default: None How dtypes should be infered (see ``Binary.register`` doc for more info). result_dtype : np.dtype, optional - NumPy dtype of the result. If not specified it will be inferred from the `infer_dtypes` parameter. + NumPy dtype of the result. If not specified it will be inferred from the `infer_dtypes` parameter. Only NumPy? axis : int, default: 0 Axis to perform the binary operation along. func : callable(pandas.DataFrame, pandas.DataFrame) -> pandas.DataFrame, optional diff --git a/modin/core/dataframe/algebra/map.py b/modin/core/dataframe/algebra/map.py index 57b21f6e1b0..aefebe6c017 100644 --- a/modin/core/dataframe/algebra/map.py +++ b/modin/core/dataframe/algebra/map.py @@ -13,8 +13,15 @@ """Module houses builder class for Map operator.""" +from __future__ import annotations + +from typing import TYPE_CHECKING + from .operator import Operator +if TYPE_CHECKING: + from modin.core.storage_formats.pandas.query_compiler import PandasQueryCompiler + class Map(Operator): """Builder class for Map operator.""" @@ -41,7 +48,7 @@ def register(cls, function, *call_args, **call_kwds): Function that takes query compiler and executes map function. """ - def caller(query_compiler, *args, **kwargs): + def caller(query_compiler: PandasQueryCompiler, *args, **kwargs): """Execute Map function against passed query compiler.""" shape_hint = call_kwds.pop("shape_hint", None) or query_compiler._shape_hint return query_compiler.__constructor__( diff --git a/modin/core/dataframe/algebra/tree_reduce.py b/modin/core/dataframe/algebra/tree_reduce.py index fa7b731e6f5..8a30196cbeb 100644 --- a/modin/core/dataframe/algebra/tree_reduce.py +++ b/modin/core/dataframe/algebra/tree_reduce.py @@ -35,7 +35,7 @@ def register( axis : int, optional Specifies axis to apply function along. compute_dtypes : callable(pandas.Series, *func_args, **func_kwargs) -> np.dtype, optional - Callable for computing dtypes. + Callable for computing dtypes. Only NumPy? Returns ------- diff --git a/modin/core/dataframe/pandas/dataframe/dataframe.py b/modin/core/dataframe/pandas/dataframe/dataframe.py index c0989b246c4..4a4bb8906d9 100644 --- a/modin/core/dataframe/pandas/dataframe/dataframe.py +++ b/modin/core/dataframe/pandas/dataframe/dataframe.py @@ -17,6 +17,9 @@ PandasDataframe is a parent abstract class for any dataframe class for pandas storage format. 
""" + +from __future__ import annotations + import datetime import re from typing import TYPE_CHECKING, Callable, Dict, Hashable, List, Optional, Union @@ -97,15 +100,18 @@ class PandasDataframe( # These properties flag whether or not we are deferring the metadata synchronization _deferred_index = False _deferred_column = False + _index_cache: ModinIndex = None + _columns_cache: ModinIndex = None + _dtypes: Optional[ModinDtypes] = None @pandas.util.cache_readonly - def __constructor__(self): + def __constructor__(self) -> Callable[..., PandasDataframe]: """ Create a new instance of this object. Returns ------- - PandasDataframe + callable """ return type(self) @@ -451,9 +457,6 @@ def dtype_builder(df): dtypes.name = None return dtypes - _index_cache = None - _columns_cache = None - def set_index_cache(self, index): """ Set index cache. @@ -2230,6 +2233,7 @@ def map( if isinstance(new_columns, ModinIndex): # Materializing lazy columns in order to build dtype's index new_columns = new_columns.get(return_lengths=False) + # TODO: consider backend dtypes = pandas.Series( [pandas.api.types.pandas_dtype(dtypes)] * len(new_columns), index=new_columns, @@ -2894,7 +2898,7 @@ def apply_full_axis( enumerate_partitions : bool, default: False Whether pass partition index into applied `func` or not. Note that `func` must be able to obtain `partition_idx` kwarg. - dtypes : list-like, optional + dtypes : list-like or scalar, optional The data types of the result. This is an optimization because there are functions that always result in a particular data type, and allows us to avoid (re)computing it. @@ -2948,7 +2952,7 @@ def apply_full_axis_select_indices( new_index=None, new_columns=None, keep_remaining=False, - new_dtypes=None, + new_dtypes: Optional[Union[pandas.Series, ModinDtypes]] = None, ): """ Apply a function across an entire axis for a subset of the data. @@ -3017,10 +3021,10 @@ def apply_select_indices( col_labels=None, new_index=None, new_columns=None, - new_dtypes=None, + new_dtypes: Optional[pandas.Series] = None, keep_remaining=False, item_to_distribute=no_default, - ): + ) -> PandasDataframe: """ Apply a function for a subset of the data. @@ -3405,12 +3409,12 @@ def broadcast_apply_full_axis( new_columns : list-like, optional Columns of the result. We may know this in advance, and if not provided it must be computed. - apply_indices : list-like, default: None + apply_indices : list-like, optional Indices of `axis ^ 1` to apply function over. enumerate_partitions : bool, default: False Whether pass partition index into applied `func` or not. Note that `func` must be able to obtain `partition_idx` kwarg. - dtypes : list-like, default: None + dtypes : list-like or scalar, optional Data types of the result. This is an optimization because there are functions that always result in a particular data type, and allows us to avoid (re)computing it. 
@@ -3486,6 +3490,7 @@ def broadcast_apply_full_axis( if new_columns is None: kw["dtypes"] = ModinDtypes( DtypesDescriptor( + # TODO: pyarrow backend remaining_dtype=pandas.api.types.pandas_dtype(dtypes) ) ) @@ -3494,6 +3499,7 @@ def broadcast_apply_full_axis( pandas.Series(dtypes, index=new_columns) if is_list_like(dtypes) else pandas.Series( + # TODO: pyarrow backend [pandas.api.types.pandas_dtype(dtypes)] * len(new_columns), index=new_columns, ) diff --git a/modin/core/dataframe/pandas/metadata/dtypes.py b/modin/core/dataframe/pandas/metadata/dtypes.py index 88f575d7288..ec227cde32f 100644 --- a/modin/core/dataframe/pandas/metadata/dtypes.py +++ b/modin/core/dataframe/pandas/metadata/dtypes.py @@ -13,6 +13,8 @@ """Module contains class ``ModinDtypes``.""" +from __future__ import annotations + from typing import TYPE_CHECKING, Callable, Optional, Union import numpy as np @@ -62,6 +64,7 @@ def __init__( self, known_dtypes: Optional[Union[dict[IndexLabel, np.dtype], pandas.Series]] = None, cols_with_unknown_dtypes: Optional[list[IndexLabel]] = None, + # TODO: what if there is a type of another backend remaining_dtype: Optional[np.dtype] = None, parent_df: Optional["PandasDataframe"] = None, columns_order: Optional[dict[int, IndexLabel]] = None, @@ -747,9 +750,7 @@ class ModinDtypes: def __init__( self, - value: Optional[ - Union[Callable, pandas.Series, DtypesDescriptor, "ModinDtypes"] - ], + value: Optional[Union[Callable, pandas.Series, DtypesDescriptor, ModinDtypes]], ): if callable(value) or isinstance(value, pandas.Series): self._value = value @@ -779,6 +780,7 @@ def is_materialized(self) -> bool: """ return isinstance(self._value, pandas.Series) + # TODO: pyarrow backend def get_dtypes_set(self) -> set[np.dtype]: """ Get a set of dtypes from the descriptor. @@ -793,9 +795,7 @@ def get_dtypes_set(self) -> set[np.dtype]: self.get() return set(self._value.values) - def maybe_specify_new_frame_ref( - self, new_parent: "PandasDataframe" - ) -> "ModinDtypes": + def maybe_specify_new_frame_ref(self, new_parent: PandasDataframe) -> ModinDtypes: """ Set a new parent for the stored value if needed. @@ -817,7 +817,7 @@ def maybe_specify_new_frame_ref( return new_self return new_self - def lazy_get(self, ids: list, numeric_index: bool = False) -> "ModinDtypes": + def lazy_get(self, ids: list, numeric_index: bool = False) -> ModinDtypes: """ Get new ``ModinDtypes`` for a subset of columns without triggering any computations. @@ -849,7 +849,7 @@ def lazy_get(self, ids: list, numeric_index: bool = False) -> "ModinDtypes": return ModinDtypes(self._value.iloc[ids] if numeric_index else self._value[ids]) @classmethod - def concat(cls, values: list, axis: int = 0) -> "ModinDtypes": + def concat(cls, values: list, axis: int = 0) -> ModinDtypes: """ Concatenate dtypes. @@ -893,7 +893,7 @@ def concat(cls, values: list, axis: int = 0) -> "ModinDtypes": desc = pandas.concat(values) return ModinDtypes(desc) - def set_index(self, new_index: Union[pandas.Index, "ModinIndex"]) -> "ModinDtypes": + def set_index(self, new_index: Union[pandas.Index, ModinIndex]) -> ModinDtypes: """ Set new column names for stored dtypes. @@ -997,7 +997,7 @@ def __getattr__(self, name): self.get() return self._value.__getattribute__(name) - def copy(self) -> "ModinDtypes": + def copy(self) -> ModinDtypes: """ Copy an object without materializing the internal representation. 
@@ -1235,13 +1235,22 @@ def extract_dtype(value): from modin.pandas.utils import is_scalar if hasattr(value, "dtype"): + # If we're dealing with a numpy scalar (np.int, np.datetime64, ...) + # we would like to get its internal dtype return value.dtype + elif hasattr(value, "to_numpy"): + # If we're dealing with a scalar that can be converted to numpy (for example pandas.Timestamp) + # we would like to convert it and get its proper internal dtype + return value.to_numpy().dtype elif hasattr(value, "dtypes"): return value.dtypes elif is_scalar(value): if value is None: # previous type was object instead of 'float64' return pandas.api.types.pandas_dtype(value) + # TODO: backend is not taken into account + # pd.api.types.pandas_dtype(pd.ArrowDtype(pa.array([1,2,3]).type)) return pandas.api.types.pandas_dtype(type(value)) else: + # TODO: new way without numpy? return np.array(value).dtype diff --git a/modin/core/dataframe/pandas/metadata/index.py b/modin/core/dataframe/pandas/metadata/index.py index d5aa37e52a0..b731a99bc73 100644 --- a/modin/core/dataframe/pandas/metadata/index.py +++ b/modin/core/dataframe/pandas/metadata/index.py @@ -15,6 +15,7 @@ import functools import uuid +from typing import Optional import pandas from pandas.core.dtypes.common import is_list_like @@ -44,7 +45,7 @@ class ModinIndex: Materialized dtypes of index levels. """ - def __init__(self, value=None, axis=None, dtypes=None): + def __init__(self, value=None, axis=None, dtypes: Optional[pandas.Series] = None): from modin.core.dataframe.pandas.dataframe.dataframe import PandasDataframe self._is_default_callable = False @@ -69,7 +70,7 @@ def __init__(self, value=None, axis=None, dtypes=None): self._index_id = uuid.uuid4() self._lengths_id = uuid.uuid4() - def maybe_get_dtypes(self): + def maybe_get_dtypes(self) -> Optional[pandas.Series]: """ Get index dtypes if available. diff --git a/modin/core/storage_formats/base/query_compiler.py b/modin/core/storage_formats/base/query_compiler.py index ae78bc27f09..c6e09d00dfb 100644 --- a/modin/core/storage_formats/base/query_compiler.py +++ b/modin/core/storage_formats/base/query_compiler.py @@ -17,9 +17,11 @@ ``BaseQueryCompiler`` is a parent abstract class for any other query compiler class. """ +from __future__ import annotations + import abc import warnings -from typing import Hashable, List, Optional +from typing import Callable, Hashable, List, Optional import numpy as np import pandas @@ -4279,6 +4281,7 @@ def get_positions_from_labels(self, row_loc, col_loc): # `Index.get_indexer_for` works much faster with numpy arrays than with python lists, # so although we lose some time here on converting to numpy, `Index.get_indexer_for` # speedup covers the loss that we gain here. + # TODO: pyarrow backend? axis_loc = np.array(axis_loc, dtype=axis_labels.dtype) axis_lookup = axis_labels.get_indexer_for(axis_loc) # `Index.get_indexer_for` sets -1 value for missing labels, we have to verify whether @@ -4456,7 +4459,7 @@ def write_items(df, broadcasted_items): # END Abstract methods for QueryCompiler @pandas.util.cache_readonly - def __constructor__(self): + def __constructor__(self) -> Callable[..., BaseQueryCompiler]: """ Get query compiler constructor. diff --git a/modin/core/storage_formats/pandas/query_compiler.py b/modin/core/storage_formats/pandas/query_compiler.py index 060e7849768..b139ad4ff56 100644 --- a/modin/core/storage_formats/pandas/query_compiler.py +++ b/modin/core/storage_formats/pandas/query_compiler.py @@ -18,12 +18,14 @@ queries for the ``PandasDataframe``. 
""" +from __future__ import annotations + import ast import hashlib import re import warnings from collections.abc import Iterable -from typing import Hashable, List +from typing import TYPE_CHECKING, Hashable, List, Optional import numpy as np import pandas @@ -37,6 +39,7 @@ is_datetime64_any_dtype, is_list_like, is_numeric_dtype, + is_timedelta64_dtype, ) from pandas.core.groupby.base import transformation_kernels from pandas.core.indexes.api import ensure_index_from_sequences @@ -79,6 +82,9 @@ from .merge import MergeImpl from .utils import get_group_names, merge_partitioning +if TYPE_CHECKING: + from modin.core.dataframe.pandas.dataframe.dataframe import PandasDataframe + def _get_axis(axis): """ @@ -263,7 +269,10 @@ class PandasQueryCompiler(BaseQueryCompiler): Shape hint for frames known to be a column or a row, otherwise None. """ - def __init__(self, modin_frame, shape_hint=None): + _modin_frame: PandasDataframe + _shape_hint: Optional[str] + + def __init__(self, modin_frame: PandasDataframe, shape_hint: Optional[str] = None): self._modin_frame = modin_frame self._shape_hint = shape_hint @@ -935,6 +944,7 @@ def compute_dtypes_fn(dtypes, axis, **kwargs): and any(is_numeric_dtype(t) for t in dtypes) ): return "object" + # how to take into account backend here? return "float64" return TreeReduce.register( @@ -1846,39 +1856,41 @@ def isin_func(df, values): abs = Map.register(pandas.DataFrame.abs, dtypes="copy") map = Map.register(pandas.DataFrame.map) + # Will it work with pyarrow backend? conj = Map.register(lambda df, *args, **kwargs: pandas.DataFrame(np.conj(df))) convert_dtypes = Fold.register(pandas.DataFrame.convert_dtypes) invert = Map.register(pandas.DataFrame.__invert__, dtypes="copy") isna = Map.register(pandas.DataFrame.isna, dtypes="bool") + # better way to distinguish methods for NumPy API? _isfinite = Map.register( lambda df, *args, **kwargs: pandas.DataFrame(np.isfinite(df, *args, **kwargs)), - dtypes=np.bool_, + dtypes="bool", ) _isinf = Map.register( # Needed for numpy API lambda df, *args, **kwargs: pandas.DataFrame(np.isinf(df, *args, **kwargs)), - dtypes=np.bool_, + dtypes="bool", ) _isnat = Map.register( # Needed for numpy API lambda df, *args, **kwargs: pandas.DataFrame(np.isnat(df, *args, **kwargs)), - dtypes=np.bool_, + dtypes="bool", ) _isneginf = Map.register( # Needed for numpy API lambda df, *args, **kwargs: pandas.DataFrame(np.isneginf(df, *args, **kwargs)), - dtypes=np.bool_, + dtypes="bool", ) _isposinf = Map.register( # Needed for numpy API lambda df, *args, **kwargs: pandas.DataFrame(np.isposinf(df, *args, **kwargs)), - dtypes=np.bool_, + dtypes="bool", ) _iscomplex = Map.register( # Needed for numpy API lambda df, *args, **kwargs: pandas.DataFrame(np.iscomplex(df, *args, **kwargs)), - dtypes=np.bool_, + dtypes="bool", ) _isreal = Map.register( # Needed for numpy API lambda df, *args, **kwargs: pandas.DataFrame(np.isreal(df, *args, **kwargs)), - dtypes=np.bool_, + dtypes="bool", ) - _logical_not = Map.register(np.logical_not, dtypes=np.bool_) # Needed for numpy API + _logical_not = Map.register(np.logical_not, dtypes="bool") # Needed for numpy API _tanh = Map.register( lambda df, *args, **kwargs: pandas.DataFrame(np.tanh(df, *args, **kwargs)) ) # Needed for numpy API @@ -2122,6 +2134,7 @@ def astype(self, col_dtypes, errors: str = "raise"): # other query compilers may not take care of error handling at the API # layer. This query compiler assumes there won't be any errors due to # invalid type keys. 
+ # Function that can change the backend return self.__constructor__( self._modin_frame.astype(col_dtypes, errors=errors), shape_hint=self._shape_hint, @@ -2280,6 +2293,7 @@ def map_func(df): # pragma: no cover """Compute covariance or correlation matrix for the passed frame.""" df = df.to_numpy() n_rows = df.shape[0] + # Does it work with pyarrow backend? df_mask = np.isfinite(df) result = np.empty((n_rows, n_cols), dtype="float64") @@ -2604,7 +2618,11 @@ def quantile_for_list_of_values(self, **kwargs): new_columns = [ col for col, dtype in zip(self.columns, self.dtypes) - if (is_numeric_dtype(dtype) or lib.is_np_dtype(dtype, "mM")) + if ( + is_numeric_dtype(dtype) + or is_timedelta64_dtype(dtype) + or is_datetime64_any_dtype(dtype) + ) ] if axis == 1: query_compiler = self.getitem_column_array(new_columns) @@ -2799,13 +2817,14 @@ def applyier(df, internal_indices, other=[], internal_other_indices=[]): # __getitem__ methods __getitem_bool = Binary.register( + # TODO: `is_scalar` don't work with pyarrow scalars lambda df, r: df[[r]] if is_scalar(r) else df[r], join_type="left", labels="drop", ) # __setitem__ methods - def setitem_bool(self, row_loc, col_loc, item): + def setitem_bool(self, row_loc: PandasQueryCompiler, col_loc, item): def _set_item(df, row_loc): # pragma: no cover df = df.copy() df.loc[row_loc.squeeze(axis=1), col_loc] = item @@ -2814,18 +2833,7 @@ def _set_item(df, row_loc): # pragma: no cover if self._modin_frame.has_materialized_dtypes and is_scalar(item): new_dtypes = self.dtypes.copy() old_dtypes = new_dtypes[col_loc] - - if hasattr(item, "dtype"): - # If we're dealing with a numpy scalar (np.int, np.datetime64, ...) - # we would like to get its internal dtype - item_type = item.dtype - elif hasattr(item, "to_numpy"): - # If we're dealing with a scalar that can be converted to numpy (for example pandas.Timestamp) - # we would like to convert it and get its proper internal dtype - item_type = item.to_numpy().dtype - else: - item_type = pandas.api.types.pandas_dtype(type(item)) - + item_type = extract_dtype(item) if isinstance(old_dtypes, pandas.Series): new_dtypes[col_loc] = [ find_common_type([dtype, item_type]) for dtype in old_dtypes.values @@ -2893,7 +2901,9 @@ def getitem_array(self, key): ) return self.getitem_column_array(key) - def getitem_column_array(self, key, numeric=False, ignore_order=False): + def getitem_column_array( + self, key, numeric=False, ignore_order=False + ) -> PandasQueryCompiler: shape_hint = "column" if len(key) == 1 else None if numeric: if ignore_order and is_list_like(key): @@ -3053,6 +3063,7 @@ def mapper(df: pandas.DataFrame): ) # we have to keep other columns so setting their mask # values with `False` + # TODO: pyarrow backend? mask = pandas.Series( np.zeros(df.shape[1], dtype=bool), index=df.columns ) @@ -3105,7 +3116,9 @@ def reduce(df: pandas.DataFrame, mask: pandas.DataFrame): shape_hint=self._shape_hint, ) - def drop(self, index=None, columns=None, errors: str = "raise"): + def drop( + self, index=None, columns=None, errors: str = "raise" + ) -> PandasQueryCompiler: # `errors` parameter needs to be part of the function signature because # other query compilers may not take care of error handling at the API # layer. 
This query compiler assumes there won't be any errors due to @@ -3152,7 +3165,8 @@ def _compute_duplicated(df): # pragma: no cover hashed_modin_frame = self._modin_frame.reduce( axis=1, function=_compute_hash, - dtypes=pandas.api.types.pandas_dtype("O"), + # TODO: pyarrow backend + dtypes="object", ) else: hashed_modin_frame = self._modin_frame @@ -3906,7 +3920,7 @@ def agg_func(grp, *args, **kwargs): add_missing_cats=add_missing_cats, **groupby_kwargs, ) - result_qc = self.__constructor__(result) + result_qc: PandasQueryCompiler = self.__constructor__(result) if not is_transform and not groupby_kwargs.get("as_index", True): return result_qc.reset_index(drop=True) @@ -4440,14 +4454,15 @@ def map_fn(df): # pragma: no cover # efficient if we are mapping over all of the data to do it this way # than it would be to reuse the code for specific columns. if len(columns) == len(self.columns): + # TODO: pyarrow backend new_modin_frame = self._modin_frame.apply_full_axis( - 0, map_fn, new_index=self.index, dtypes=bool + 0, map_fn, new_index=self.index, dtypes="bool" ) untouched_frame = None else: new_modin_frame = self._modin_frame.take_2d_labels_or_positional( col_labels=columns - ).apply_full_axis(0, map_fn, new_index=self.index, dtypes=bool) + ).apply_full_axis(0, map_fn, new_index=self.index, dtypes="bool") untouched_frame = self.drop(columns=columns) # If we mapped over all the data we are done. If not, we need to # prepend the `new_modin_frame` with the raw data from the columns that were @@ -4496,10 +4511,11 @@ def iloc_mut(partition, row_internal_indices, col_internal_indices, item): pandas.DataFrame Partition data with updated values. """ - partition = partition.copy() try: partition.iloc[row_internal_indices, col_internal_indices] = item except ValueError: + # maybe make a copy only if there is an exception? + partition = partition.copy() # `copy` is needed to avoid "ValueError: buffer source array is read-only" for `item` # because the item may be converted to the type that is in the dataframe. 
# TODO: in the future we will need to convert to the correct type manually according @@ -4519,7 +4535,7 @@ def iloc_mut(partition, row_internal_indices, col_internal_indices, item): ) else: broadcasted_item, broadcasted_dtypes = item, pandas.Series( - [np.array(item).dtype] * len(col_numeric_index) + [extract_dtype(item)] * len(col_numeric_index) ) new_dtypes = None @@ -4572,7 +4588,10 @@ def sort_columns_by_row_values(self, rows, ascending=True, **kwargs): def cat_codes(self): def func(df: pandas.DataFrame) -> pandas.DataFrame: ser = df.iloc[:, 0] - assert isinstance(ser.dtype, pandas.CategoricalDtype) + if not isinstance(ser.dtype, pandas.CategoricalDtype): + raise TypeError( + f"Series dtype should be `CategoricalDtype`: actual dtype: {ser.dtype}" + ) return ser.cat.codes.to_frame(name=MODIN_UNNAMED_SERIES_LABEL) res = self._modin_frame.map(func=func, new_columns=[MODIN_UNNAMED_SERIES_LABEL]) diff --git a/modin/pandas/utils.py b/modin/pandas/utils.py index 03a9078b0ee..3a9ad6aa13d 100644 --- a/modin/pandas/utils.py +++ b/modin/pandas/utils.py @@ -297,6 +297,7 @@ def broadcast_item( try: # Cast to numpy drop information about heterogeneous types (cast to common) # TODO: we shouldn't do that, maybe there should be the if branch + # TODO: what if item comes from pyarrow item = np.array(item) if dtypes is None: dtypes = pandas.Series([item.dtype] * len(col_lookup)) From 91f2607a94d305f891273b177bc218d2d8d7f8a9 Mon Sep 17 00:00:00 2001 From: Anatoly Myachev Date: Fri, 3 May 2024 18:23:50 +0200 Subject: [PATCH 17/50] some more places Signed-off-by: Anatoly Myachev --- .../dataframe/pandas/dataframe/dataframe.py | 108 ++++++++++-------- .../core/dataframe/pandas/metadata/dtypes.py | 54 ++++----- .../storage_formats/pandas/aggregations.py | 2 + modin/pandas/dataframe.py | 5 + modin/tests/pandas/dataframe/test_reduce.py | 1 + 5 files changed, 95 insertions(+), 75 deletions(-) diff --git a/modin/core/dataframe/pandas/dataframe/dataframe.py b/modin/core/dataframe/pandas/dataframe/dataframe.py index 4a4bb8906d9..78d297b11b0 100644 --- a/modin/core/dataframe/pandas/dataframe/dataframe.py +++ b/modin/core/dataframe/pandas/dataframe/dataframe.py @@ -56,6 +56,9 @@ from modin.core.dataframe.base.interchange.dataframe_protocol.dataframe import ( ProtocolDataframe, ) + from modin.core.dataframe.pandas.partitioning.partition_manager import ( + PandasDataframePartitionManager, + ) from pandas._typing import npt from modin.logging import ClassLogger @@ -95,7 +98,7 @@ class PandasDataframe( The data types for the dataframe columns. """ - _partition_mgr_cls = None + _partition_mgr_cls: PandasDataframePartitionManager = None _query_compiler_cls = PandasQueryCompiler # These properties flag whether or not we are deferring the metadata synchronization _deferred_index = False @@ -122,7 +125,7 @@ def __init__( columns=None, row_lengths=None, column_widths=None, - dtypes=None, + dtypes: Optional[Union[pandas.Series, ModinDtypes, Callable]] = None, ): self._partitions = partitions self.set_index_cache(index) @@ -412,13 +415,13 @@ def get_dtypes_set(self): return self._dtypes.get_dtypes_set() return set(self.dtypes.values) - def _compute_dtypes(self, columns=None): + def _compute_dtypes(self, columns=None) -> pandas.Series: """ Compute the data types via TreeReduce pattern for the specified columns. Parameters ---------- - columns : list-like, default: None + columns : list-like, optional Columns to compute dtypes for. If not specified compute dtypes for all the columns in the dataframe. 
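The TreeReduce kernel behind `_compute_dtypes` boils down to combining the
dtypes reported by each row partition into one common dtype per column; a
simplified sketch with made-up partition dtypes (the in-tree kernel is more
involved):

    import pandas
    from pandas.core.dtypes.cast import find_common_type

    # dtypes of two row partitions of the same two-column frame
    part_dtypes = [
        pandas.Series({"a": "int64", "b": "float64"}).map(pandas.api.types.pandas_dtype),
        pandas.Series({"a": "float64", "b": "float64"}).map(pandas.api.types.pandas_dtype),
    ]
    # reduce to the common dtype per column: "a" -> float64, "b" -> float64
    common = pandas.concat(part_dtypes, axis=1).apply(
        lambda row: find_common_type(list(row.values)), axis=1
    )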
@@ -862,7 +865,7 @@ def synchronize_labels(self, axis=None): Parameters ---------- - axis : int, default: None + axis : int, optional The deferred axis. 0 for the index, 1 for the columns. """ @@ -883,7 +886,7 @@ def _propagate_index_objs(self, axis=None): Parameters ---------- - axis : int, default: None + axis : int, optional The axis to apply to. If it's None applies to both axes. """ self._filter_empties(compute_metadata=False) @@ -987,7 +990,7 @@ def take_2d_labels_or_positional( row_positions: Optional[List[int]] = None, col_labels: Optional[List[Hashable]] = None, col_positions: Optional[List[int]] = None, - ) -> "PandasDataframe": + ) -> PandasDataframe: """ Lazily select columns or rows from given indices. @@ -1143,7 +1146,7 @@ def _take_2d_positional( self, row_positions: Optional[List[int]] = None, col_positions: Optional[List[int]] = None, - ) -> "PandasDataframe": + ) -> PandasDataframe: """ Lazily select columns or rows from given indices. @@ -1323,10 +1326,10 @@ def _take_2d_positional( def _maybe_reorder_labels( self, - intermediate: "PandasDataframe", + intermediate: PandasDataframe, row_positions, col_positions, - ) -> "PandasDataframe": + ) -> PandasDataframe: """ Call re-order labels on take_2d_labels_or_positional result if necessary. @@ -1390,7 +1393,7 @@ def _maybe_reorder_labels( ) @lazy_metadata_decorator(apply_axis="rows") - def from_labels(self) -> "PandasDataframe": + def from_labels(self) -> PandasDataframe: """ Convert the row labels to a column of data, inserted at the first position. @@ -1492,7 +1495,7 @@ def from_labels_executor(df, **kwargs): result.synchronize_labels(axis=0) return result - def to_labels(self, column_list: List[Hashable]) -> "PandasDataframe": + def to_labels(self, column_list: List[Hashable]) -> PandasDataframe: """ Move one or more columns into the row labels. Previous labels are dropped. @@ -1665,6 +1668,7 @@ def astype(self, col_dtypes, errors: str = "raise"): if new_dtypes is None: new_dtypes = self_dtypes.copy() # Update the new dtype series to the proper pandas dtype + # TODO: pyarrow backend? new_dtype = pandas.api.types.pandas_dtype(dtype) if Engine.get() == "Dask" and hasattr(dtype, "_is_materialized"): # FIXME: https://github.com/dask/distributed/issues/8585 @@ -1694,6 +1698,7 @@ def astype_builder(df): # Assume that the dtype is a scalar. if not (col_dtypes == self_dtypes).all(): new_dtypes = self_dtypes.copy() + # TODO: pyarrow backend? new_dtype = pandas.api.types.pandas_dtype(col_dtypes) if Engine.get() == "Dask" and hasattr(new_dtype, "_is_materialized"): # FIXME: https://github.com/dask/distributed/issues/8585 @@ -1935,7 +1940,7 @@ def _join_index_objects(axis, indexes, how, sort, fill_value=None): considered to be the first index in the `indexes` list. sort : boolean Whether or not to sort the joined index. - fill_value : any, default: None + fill_value : any, optional Value to use for missing values. Returns @@ -2084,6 +2089,7 @@ def _compute_tree_reduce_metadata(self, axis, new_parts, dtypes=None): if dtypes == "copy": dtypes = self.copy_dtypes_cache() elif dtypes is not None: + # TODO: pyarrow backend? dtypes = pandas.Series( [pandas.api.types.pandas_dtype(dtypes)] * len(new_axes[1]), index=new_axes[1], @@ -2103,7 +2109,7 @@ def reduce( axis: Union[int, Axis], function: Callable, dtypes: Optional[str] = None, - ) -> "PandasDataframe": + ) -> PandasDataframe: """ Perform a user-defined aggregation on the specified axis, where the axis reduces down to a singleton. Requires knowledge of the full axis for the reduction. 
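The `# TODO: pyarrow backend?` notes in these hunks are addressed later in the
series by a `construct_dtype` helper on `PandasDataframe`; a standalone sketch
of its intended behavior (the free-function form and the final assert are for
illustration only, and the pyarrow branch assumes pyarrow is installed):

    import pandas

    def construct_dtype(dtype, backend=None):
        if backend is None:
            return pandas.api.types.pandas_dtype(dtype)
        elif backend == "pyarrow":
            # "bool" -> ArrowDtype rendered as "bool[pyarrow]"
            return pandas.api.types.pandas_dtype(f"{dtype}[{backend}]")
        raise NotImplementedError(backend)

    assert str(construct_dtype("bool", "pyarrow")) == "bool[pyarrow]"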
@@ -2141,7 +2147,7 @@ def tree_reduce( map_func: Callable, reduce_func: Optional[Callable] = None, dtypes: Optional[str] = None, - ) -> "PandasDataframe": + ) -> PandasDataframe: """ Apply function that will reduce the data to a pandas Series. @@ -2188,7 +2194,7 @@ def map( func_args=None, func_kwargs=None, lazy=False, - ) -> "PandasDataframe": + ) -> PandasDataframe: """ Perform a function that maps across the entire dataset. @@ -2253,7 +2259,7 @@ def window( reduce_fn: Callable, window_size: int, result_schema: Optional[Dict[Hashable, type]] = None, - ) -> "PandasDataframe": + ) -> PandasDataframe: """ Apply a sliding window operator that acts as a GROUPBY on each window, and reduces down to a single row (column) per window. @@ -2326,7 +2332,7 @@ def fold(self, axis, func, new_columns=None): self._column_widths_cache, ) - def infer_objects(self) -> "PandasDataframe": + def infer_objects(self) -> PandasDataframe: """ Attempt to infer better dtypes for object columns. @@ -2344,7 +2350,7 @@ def infer_objects(self) -> "PandasDataframe": ] return self.infer_types(obj_cols) - def infer_types(self, col_labels: List[str]) -> "PandasDataframe": + def infer_types(self, col_labels: List[str]) -> PandasDataframe: """ Determine the compatible type shared by all values in the specified columns, and coerce them to that type. @@ -2378,7 +2384,7 @@ def join( condition: Callable, other: ModinDataframe, join_type: Union[str, JoinType], - ) -> "PandasDataframe": + ) -> PandasDataframe: """ Join this dataframe with the other. @@ -2414,7 +2420,7 @@ def rename( self, new_row_labels: Optional[Union[Dict[Hashable, Hashable], Callable]] = None, new_col_labels: Optional[Union[Dict[Hashable, Hashable], Callable]] = None, - ) -> "PandasDataframe": + ) -> PandasDataframe: """ Replace the row and column labels with the specified new labels. @@ -2670,7 +2676,7 @@ def sort_by( columns: Union[str, List[str]], ascending: bool = True, **kwargs, - ) -> "PandasDataframe": + ) -> PandasDataframe: """ Logically reorder rows (columns if axis=1) lexicographically by the data in a column or set of columns. @@ -2738,7 +2744,7 @@ def sort_function(df): # pragma: no cover return result @lazy_metadata_decorator(apply_axis="both") - def filter(self, axis: Union[Axis, int], condition: Callable) -> "PandasDataframe": + def filter(self, axis: Union[Axis, int], condition: Callable) -> PandasDataframe: """ Filter data based on the function provided along an entire axis. @@ -2780,7 +2786,7 @@ def filter(self, axis: Union[Axis, int], condition: Callable) -> "PandasDatafram self.copy_dtypes_cache() if axis == Axis.COL_WISE else None, ) - def filter_by_types(self, types: List[Hashable]) -> "PandasDataframe": + def filter_by_types(self, types: List[Hashable]) -> PandasDataframe: """ Allow the user to specify a type or set of types by which to filter the columns. @@ -2799,7 +2805,7 @@ def filter_by_types(self, types: List[Hashable]) -> "PandasDataframe": ) @lazy_metadata_decorator(apply_axis="both") - def explode(self, axis: Union[int, Axis], func: Callable) -> "PandasDataframe": + def explode(self, axis: Union[int, Axis], func: Callable) -> PandasDataframe: """ Explode list-like entries along an entire axis. @@ -2834,7 +2840,7 @@ def explode(self, axis: Union[int, Axis], func: Callable) -> "PandasDataframe": partitions, new_index, new_columns, row_lengths, column_widths ) - def combine(self) -> "PandasDataframe": + def combine(self) -> PandasDataframe: """ Create a single partition PandasDataframe from the partitions of the current dataframe. 
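Most of the annotation churn in these hunks just drops the quotes from
`"PandasDataframe"`; this is safe because the module now starts with
`from __future__ import annotations` (PEP 563), under which annotations are
stored as strings and never evaluated at definition time. A minimal
self-contained illustration with a hypothetical `Node` class:

    from __future__ import annotations

    class Node:
        def clone(self) -> Node:  # no quotes needed for the self-reference
            return Node()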
@@ -2893,7 +2899,7 @@ def apply_full_axis( new_columns : list-like, optional The columns of the result. We may know this in advance, and if not provided it must be computed. - apply_indices : list-like, default: None + apply_indices : list-like, optional Indices of `axis ^ 1` to apply function over. enumerate_partitions : bool, default: False Whether pass partition index into applied `func` or not. @@ -2963,9 +2969,9 @@ def apply_full_axis_select_indices( The axis to apply over. func : callable The function to apply. - apply_indices : list-like, default: None + apply_indices : list-like, optional The labels to apply over. - numeric_indices : list-like, default: None + numeric_indices : list-like, optional The indices to apply over. new_index : list-like, optional The index of the result. We may know this in advance, @@ -3034,12 +3040,12 @@ def apply_select_indices( The axis to apply over. func : callable The function to apply. - apply_indices : list-like, default: None + apply_indices : list-like, optional The labels to apply over. Must be given if axis is provided. - row_labels : list-like, default: None + row_labels : list-like, optional The row labels to apply over. Must be provided with `col_labels` to apply over both axes. - col_labels : list-like, default: None + col_labels : list-like, optional The column labels to apply over. Must be provided with `row_labels` to apply over both axes. new_index : list-like, optional @@ -3166,7 +3172,7 @@ def broadcast_apply( labels : {"keep", "replace", "drop"}, default: "keep" Whether keep labels from `self` Modin DataFrame, replace them with labels from joined DataFrame or drop altogether to make them be computed lazily later. - dtypes : "copy", pandas.Series or None, default: None + dtypes : "copy", pandas.Series or None, optional Dtypes of the result. "copy" to keep old dtypes and None to compute them on demand. Returns @@ -3318,9 +3324,9 @@ def broadcast_apply_select_indices( Function to apply. other : PandasDataframe Partitions of which should be broadcasted. - apply_indices : list, default: None + apply_indices : list, optional List of labels to apply (if `numeric_indices` are not specified). - numeric_indices : list, default: None + numeric_indices : list, optional Numeric indices to apply (if `apply_indices` are not specified). keep_remaining : bool, default: False Whether drop the data that is not computed over or not. @@ -3488,6 +3494,8 @@ def broadcast_apply_full_axis( kw["dtypes"] = dtypes.copy() else: if new_columns is None: + assert not is_list_like(dtypes) + # need something like this utility: construct_dtype() kw["dtypes"] = ModinDtypes( DtypesDescriptor( # TODO: pyarrow backend @@ -3572,7 +3580,7 @@ def broadcast_apply_full_axis( result.synchronize_labels(axis=1) return result - def _check_if_axes_identical(self, other: "PandasDataframe", axis: int = 0) -> bool: + def _check_if_axes_identical(self, other: PandasDataframe, axis: int = 0) -> bool: """ Check whether indices/partitioning along the specified `axis` are identical when compared with `other`. @@ -3626,7 +3634,7 @@ def _copartition( this method will skip repartitioning if it is possible. This is because reindexing is extremely inefficient. Because this method is used to `join` or `append`, it is vital that the internal indices match. - fill_value : any, default: None + fill_value : any, optional Value to use for missing values. 
Returns @@ -3748,7 +3756,7 @@ def n_ary_op( join_type="outer", copartition_along_columns=True, labels="replace", - dtypes=None, + dtypes: Optional[pandas.Series] = None, ): """ Perform an n-opary operation by joining with other Modin DataFrame(s). @@ -3767,7 +3775,7 @@ def n_ary_op( labels : {"replace", "drop"}, default: "replace" Whether use labels from joined DataFrame or drop altogether to make them be computed lazily later. - dtypes : series, default: None + dtypes : pandas.Series, optional Dtypes of the resultant dataframe, this argument will be received if the resultant dtypes of n-opary operation is precomputed. @@ -3837,10 +3845,10 @@ def n_ary_op( def concat( self, axis: Union[int, Axis], - others: Union["PandasDataframe", List["PandasDataframe"]], + others: Union[PandasDataframe, List[PandasDataframe]], how, sort, - ) -> "PandasDataframe": + ) -> PandasDataframe: """ Concatenate `self` with one or more other Modin DataFrames. @@ -3918,6 +3926,7 @@ def _compute_new_widths(): new_index = self.index.append([other.index for other in others]) new_columns = joined_index frames = [self] + others + # TODO: should we wrap all `concat` call into "try except" block? new_dtypes = ModinDtypes.concat([frame._dtypes for frame in frames], axis=1) # If we have already cached the length of each row in at least one # of the row's partitions, we can build new_lengths for the new @@ -3961,7 +3970,13 @@ def _compute_new_widths(): ) def _apply_func_to_range_partitioning_broadcast( - self, right, func, key, new_index=None, new_columns=None, new_dtypes=None + self, + right, + func, + key, + new_index=None, + new_columns=None, + new_dtypes: Optional[Union[ModinDtypes, pandas.Series]] = None, ): """ Apply `func` against two dataframes using range-partitioning implementation. @@ -4033,7 +4048,7 @@ def groupby( self, axis: Union[int, Axis], internal_by: List[str], - external_by: List["PandasDataframe"], + external_by: List[PandasDataframe], by_positions: List[int], operator: Callable, result_schema: Optional[Dict[Hashable, type]] = None, @@ -4041,7 +4056,7 @@ def groupby( series_groupby: bool = False, add_missing_cats: bool = False, **kwargs: dict, - ) -> "PandasDataframe": + ) -> PandasDataframe: """ Generate groups based on values in the input column(s) and perform the specified operation on each. @@ -4424,7 +4439,7 @@ def groupby_reduce( new_columns : pandas.Index, optional Columns of the result. We may know this in advance, and if not provided it must be computed. - apply_indices : list-like, default: None + apply_indices : list-like, optional Indices of `axis ^ 1` to apply groupby over. Returns @@ -4534,6 +4549,7 @@ def _arrow_type_to_dtype(cls, arrow_type): import pyarrow try: + # TODO: should we map arrow types to pyarrow-backed pandas types? res = arrow_type.to_pandas_dtype() # Conversion to pandas is not implemented for some arrow types, # perform manual conversion for them: @@ -4692,7 +4708,7 @@ def __dataframe__(self, nan_as_null: bool = False, allow_copy: bool = True): ) @classmethod - def from_dataframe(cls, df: "ProtocolDataframe") -> "PandasDataframe": + def from_dataframe(cls, df: "ProtocolDataframe") -> PandasDataframe: """ Convert a DataFrame implementing the dataframe exchange protocol to a Core Modin Dataframe. 
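On the `_arrow_type_to_dtype` question above (should arrow types map to
pyarrow-backed pandas types?), a sketch of the two conversion paths; it
assumes pyarrow is installed and uses pyarrow's stock `types_mapper=` hook:

    import pandas
    import pyarrow as pa

    at = pa.table({"a": [1, 2, None], "b": ["x", "y", "z"]})

    # default mapping: the nullable int column falls back to float64,
    # strings become object
    df_numpy = at.to_pandas()

    # pyarrow-backed mapping: int64[pyarrow] and string[pyarrow] ArrowDtypes
    df_arrow = at.to_pandas(types_mapper=pandas.ArrowDtype)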
diff --git a/modin/core/dataframe/pandas/metadata/dtypes.py b/modin/core/dataframe/pandas/metadata/dtypes.py index ec227cde32f..c7979704db2 100644 --- a/modin/core/dataframe/pandas/metadata/dtypes.py +++ b/modin/core/dataframe/pandas/metadata/dtypes.py @@ -19,7 +19,7 @@ import numpy as np import pandas -from pandas._typing import IndexLabel +from pandas._typing import DtypeObj, IndexLabel from pandas.core.dtypes.cast import find_common_type if TYPE_CHECKING: @@ -35,13 +35,13 @@ class DtypesDescriptor: Parameters ---------- - known_dtypes : dict[IndexLabel, np.dtype] or pandas.Series, optional + known_dtypes : dict[IndexLabel, DtypeObj] or pandas.Series, optional Columns that we know dtypes for. cols_with_unknown_dtypes : list[IndexLabel], optional Column names that have unknown dtypes. If specified together with `remaining_dtype`, must describe all columns with unknown dtypes, otherwise, the missing columns will be assigned to `remaining_dtype`. If `cols_with_unknown_dtypes` is incomplete, you must specify `know_all_names=False`. - remaining_dtype : np.dtype, optional + remaining_dtype : DtypeObj, optional Dtype for columns that are not present neither in `known_dtypes` nor in `cols_with_unknown_dtypes`. This parameter is intended to describe columns that we known dtypes for, but don't know their names yet. Note, that this parameter DOESN'T describe dtypes for columns from `cols_with_unknown_dtypes`. @@ -62,11 +62,10 @@ class DtypesDescriptor: def __init__( self, - known_dtypes: Optional[Union[dict[IndexLabel, np.dtype], pandas.Series]] = None, + known_dtypes: Optional[Union[dict[IndexLabel, DtypeObj], pandas.Series]] = None, cols_with_unknown_dtypes: Optional[list[IndexLabel]] = None, - # TODO: what if there is a type of another backend - remaining_dtype: Optional[np.dtype] = None, - parent_df: Optional["PandasDataframe"] = None, + remaining_dtype: Optional[DtypeObj] = None, + parent_df: Optional[PandasDataframe] = None, columns_order: Optional[dict[int, IndexLabel]] = None, know_all_names: bool = True, _schema_is_known: Optional[bool] = None, @@ -76,7 +75,7 @@ def __init__( "It's not allowed to pass 'remaining_dtype' and 'know_all_names=False' at the same time." ) # columns with known dtypes - self._known_dtypes: dict[IndexLabel, np.dtype] = ( + self._known_dtypes: dict[IndexLabel, DtypeObj] = ( {} if known_dtypes is None else dict(known_dtypes) ) if known_dtypes is not None and len(self._known_dtypes) != len(known_dtypes): @@ -109,7 +108,7 @@ def __init__( self._know_all_names: bool = know_all_names # a common dtype for columns that are not present in 'known_dtypes' nor in 'cols_with_unknown_dtypes' - self._remaining_dtype: Optional[np.dtype] = remaining_dtype + self._remaining_dtype: Optional[DtypeObj] = remaining_dtype self._parent_df: Optional["PandasDataframe"] = parent_df if columns_order is None: self._columns_order: Optional[dict[int, IndexLabel]] = None @@ -135,7 +134,7 @@ def __init__( ) self._columns_order: Optional[dict[int, IndexLabel]] = columns_order - def update_parent(self, new_parent: "PandasDataframe"): + def update_parent(self, new_parent: PandasDataframe): """ Set new parent dataframe. @@ -205,7 +204,7 @@ def __str__(self): # noqa: GL08 def lazy_get( self, ids: list[Union[IndexLabel, int]], numeric_index: bool = False - ) -> "DtypesDescriptor": + ) -> DtypesDescriptor: """ Get dtypes descriptor for a subset of columns without triggering any computations. 
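For reference, an illustrative construction of the descriptor with the
parameters documented above (column names and dtypes are made up): "a" has a
known dtype, "b" is pending computation, and any columns not yet named share
`remaining_dtype`:

    import pandas

    from modin.core.dataframe.pandas.metadata.dtypes import DtypesDescriptor

    desc = DtypesDescriptor(
        known_dtypes={"a": pandas.api.types.pandas_dtype("int64")},
        cols_with_unknown_dtypes=["b"],
        remaining_dtype=pandas.api.types.pandas_dtype("float64"),
    )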
@@ -258,7 +257,7 @@ def lazy_get( columns_order=columns_order, ) - def copy(self) -> "DtypesDescriptor": + def copy(self) -> DtypesDescriptor: """ Get a copy of this descriptor. @@ -280,9 +279,7 @@ def copy(self) -> "DtypesDescriptor": _schema_is_known=self._schema_is_known, ) - def set_index( - self, new_index: Union[pandas.Index, "ModinIndex"] - ) -> "DtypesDescriptor": + def set_index(self, new_index: Union[pandas.Index, ModinIndex]) -> DtypesDescriptor: """ Set new column names for this descriptor. @@ -327,7 +324,7 @@ def set_index( } return new_self - def equals(self, other: "DtypesDescriptor") -> bool: + def equals(self, other: DtypesDescriptor) -> bool: """ Compare two descriptors for equality. @@ -444,25 +441,25 @@ def to_series(self) -> pandas.Series: self.materialize() return pandas.Series(self._known_dtypes) - def get_dtypes_set(self) -> set[np.dtype]: + def get_dtypes_set(self) -> set[DtypeObj]: """ Get a set of dtypes from the descriptor. Returns ------- - set[np.dtype] + set[DtypeObj] """ if len(self._cols_with_unknown_dtypes) > 0 or not self._know_all_names: self._materialize_cols_with_unknown_dtypes() - known_dtypes: set[np.dtype] = set(self._known_dtypes.values()) + known_dtypes: set[DtypeObj] = set(self._known_dtypes.values()) if self._remaining_dtype is not None: known_dtypes.add(self._remaining_dtype) return known_dtypes @classmethod def _merge_dtypes( - cls, values: list[Union["DtypesDescriptor", pandas.Series, None]] - ) -> "DtypesDescriptor": + cls, values: list[Union[DtypesDescriptor, pandas.Series, None]] + ) -> DtypesDescriptor: """ Union columns described by ``values`` and compute common dtypes for them. @@ -555,8 +552,8 @@ def combine_dtypes(row): @classmethod def concat( - cls, values: list[Union["DtypesDescriptor", pandas.Series, None]], axis: int = 0 - ) -> "DtypesDescriptor": + cls, values: list[Union[DtypesDescriptor, pandas.Series, None]], axis: int = 0 + ) -> DtypesDescriptor: """ Concatenate dtypes descriptors into a single descriptor. @@ -780,14 +777,13 @@ def is_materialized(self) -> bool: """ return isinstance(self._value, pandas.Series) - # TODO: pyarrow backend - def get_dtypes_set(self) -> set[np.dtype]: + def get_dtypes_set(self) -> set[DtypeObj]: """ Get a set of dtypes from the descriptor. Returns ------- - set[np.dtype] + set[DtypeObj] """ if isinstance(self._value, DtypesDescriptor): return self._value.get_dtypes_set() @@ -1201,7 +1197,7 @@ def _materialize_categories(self): def get_categories_dtype( cdt: Union[LazyProxyCategoricalDtype, pandas.CategoricalDtype] -): +) -> DtypeObj: """ Get the categories dtype. @@ -1220,7 +1216,7 @@ def get_categories_dtype( ) -def extract_dtype(value): +def extract_dtype(value) -> Union[DtypeObj, pandas.Series]: """ Extract dtype(s) from the passed `value`. @@ -1230,7 +1226,7 @@ def extract_dtype(value): Returns ------- - numpy.dtype or pandas.Series of numpy.dtypes + DtypeObj or pandas.Series of DtypeObj """ from modin.pandas.utils import is_scalar diff --git a/modin/core/storage_formats/pandas/aggregations.py b/modin/core/storage_formats/pandas/aggregations.py index 454b75c442b..7d5293b1017 100644 --- a/modin/core/storage_formats/pandas/aggregations.py +++ b/modin/core/storage_formats/pandas/aggregations.py @@ -65,6 +65,7 @@ def corr_method( qc._modin_frame.copy_columns_cache(), ) new_dtypes = pandas.Series( + # TODO: pyarrow backend? 
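+            # (presumably pandas.api.types.pandas_dtype("float64[pyarrow]")
+            # for ArrowDtype frames; an assumption, not decided by this patch)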
np.repeat(pandas.api.types.pandas_dtype("float"), len(new_columns)), index=new_columns, ) @@ -74,6 +75,7 @@ def corr_method( new_columns = old_dtypes[old_dtypes.map(is_numeric_dtype)].index new_index = new_columns.copy() new_dtypes = pandas.Series( + # TODO: pyarrow backend? np.repeat(pandas.api.types.pandas_dtype("float"), len(new_columns)), index=new_columns, ) diff --git a/modin/pandas/dataframe.py b/modin/pandas/dataframe.py index 320ddad7f4e..df0c2b9f436 100644 --- a/modin/pandas/dataframe.py +++ b/modin/pandas/dataframe.py @@ -1627,6 +1627,7 @@ def prod( return Series( [np.nan] * len(new_index), index=new_index, + # TODO: pyarrow backend? dtype=pandas.api.types.pandas_dtype("object"), ) @@ -2152,6 +2153,7 @@ def sum( return Series( [np.nan] * len(new_index), index=new_index, + # TODO: pyarrow backend? dtype=pandas.api.types.pandas_dtype("object"), ) @@ -3061,6 +3063,7 @@ def _validate_dtypes_min_max(self, axis, numeric_only) -> DataFrame: ): # check if there are columns with dtypes datetime or timedelta if all( + # TODO: pyarrow backend? dtype != pandas.api.types.pandas_dtype("datetime64[ns]") and dtype != pandas.api.types.pandas_dtype("timedelta64[ns]") for dtype in self.dtypes @@ -3097,6 +3100,7 @@ def _validate_dtypes_sum_prod_mean( not axis and numeric_only is False and any( + # TODO: pyarrow backend? dtype == pandas.api.types.pandas_dtype("datetime64[ns]") for dtype in self.dtypes ) @@ -3117,6 +3121,7 @@ def _validate_dtypes_sum_prod_mean( ): # check if there are columns with dtypes datetime or timedelta if all( + # TODO: pyarrow backend? dtype != pandas.api.types.pandas_dtype("datetime64[ns]") and dtype != pandas.api.types.pandas_dtype("timedelta64[ns]") for dtype in self.dtypes diff --git a/modin/tests/pandas/dataframe/test_reduce.py b/modin/tests/pandas/dataframe/test_reduce.py index eba5b9341af..dcbdc39e17b 100644 --- a/modin/tests/pandas/dataframe/test_reduce.py +++ b/modin/tests/pandas/dataframe/test_reduce.py @@ -327,6 +327,7 @@ def test_sum(data, axis, skipna, is_transposed, request): @pytest.mark.parametrize("dtype", ["int64", "Int64"]) def test_dtype_consistency(dtype): # test for issue #6781 + # TODO: add pyarrow dtype res_dtype = pd.DataFrame([1, 2, 3, 4], dtype=dtype).sum().dtype assert res_dtype == pandas.api.types.pandas_dtype(dtype) From e24201fc31a4dd6d8fc2981f32557720149dda39 Mon Sep 17 00:00:00 2001 From: Anatoly Myachev Date: Fri, 3 May 2024 20:49:10 +0200 Subject: [PATCH 18/50] add construct_dtype Signed-off-by: Anatoly Myachev --- .../dataframe/pandas/dataframe/dataframe.py | 50 +++++++++++-------- .../pandas/partitioning/partition_manager.py | 8 ++- 2 files changed, 34 insertions(+), 24 deletions(-) diff --git a/modin/core/dataframe/pandas/dataframe/dataframe.py b/modin/core/dataframe/pandas/dataframe/dataframe.py index 78d297b11b0..c0f8dfa28d1 100644 --- a/modin/core/dataframe/pandas/dataframe/dataframe.py +++ b/modin/core/dataframe/pandas/dataframe/dataframe.py @@ -101,11 +101,13 @@ class PandasDataframe( _partition_mgr_cls: PandasDataframePartitionManager = None _query_compiler_cls = PandasQueryCompiler # These properties flag whether or not we are deferring the metadata synchronization - _deferred_index = False - _deferred_column = False + _deferred_index: bool = False + _deferred_column: bool = False + _index_cache: ModinIndex = None _columns_cache: ModinIndex = None _dtypes: Optional[ModinDtypes] = None + _pandas_backend: str = None @pandas.util.cache_readonly def __constructor__(self) -> Callable[..., PandasDataframe]: @@ -126,6 +128,7 @@ def 
__init__( row_lengths=None, column_widths=None, dtypes: Optional[Union[pandas.Series, ModinDtypes, Callable]] = None, + pandas_backend: Optional[str] = None, ): self._partitions = partitions self.set_index_cache(index) @@ -133,6 +136,7 @@ def __init__( self._row_lengths_cache = row_lengths self._column_widths_cache = column_widths self.set_dtypes_cache(dtypes) + self._pandas_backend = pandas_backend self._validate_axes_lengths() self._filter_empties(compute_metadata=False) @@ -1668,7 +1672,7 @@ def astype(self, col_dtypes, errors: str = "raise"): if new_dtypes is None: new_dtypes = self_dtypes.copy() # Update the new dtype series to the proper pandas dtype - # TODO: pyarrow backend? + # TODO: pyarrow backend? We don't need to add an implicit backend for `astype` new_dtype = pandas.api.types.pandas_dtype(dtype) if Engine.get() == "Dask" and hasattr(dtype, "_is_materialized"): # FIXME: https://github.com/dask/distributed/issues/8585 @@ -1698,8 +1702,7 @@ def astype_builder(df): # Assume that the dtype is a scalar. if not (col_dtypes == self_dtypes).all(): new_dtypes = self_dtypes.copy() - # TODO: pyarrow backend? - new_dtype = pandas.api.types.pandas_dtype(col_dtypes) + new_dtype = self.construct_dtype(col_dtypes) if Engine.get() == "Dask" and hasattr(new_dtype, "_is_materialized"): # FIXME: https://github.com/dask/distributed/issues/8585 _ = new_dtype._materialize_categories() @@ -2089,9 +2092,8 @@ def _compute_tree_reduce_metadata(self, axis, new_parts, dtypes=None): if dtypes == "copy": dtypes = self.copy_dtypes_cache() elif dtypes is not None: - # TODO: pyarrow backend? dtypes = pandas.Series( - [pandas.api.types.pandas_dtype(dtypes)] * len(new_axes[1]), + [self.construct_dtype(dtypes, self._pandas_backend)] * len(new_axes[1]), index=new_axes[1], ) @@ -2239,9 +2241,8 @@ def map( if isinstance(new_columns, ModinIndex): # Materializing lazy columns in order to build dtype's index new_columns = new_columns.get(return_lengths=False) - # TODO: consider backend dtypes = pandas.Series( - [pandas.api.types.pandas_dtype(dtypes)] * len(new_columns), + [self.construct_dtype(dtypes, self._pandas_backend)] * len(new_columns), index=new_columns, ) return self.__constructor__( @@ -3382,6 +3383,14 @@ def broadcast_apply_select_indices( new_partitions, index=new_index, columns=new_columns ) + def construct_dtype(dtype: str, backend: Optional[str]): + if backend is None: + return pandas.api.types.pandas_dtype(dtype) + elif backend == "pyarrow": + return pandas.api.types.pandas_dtype(f"{dtype}[{backend}]") + else: + raise NotImplementedError + @lazy_metadata_decorator(apply_axis="both") def broadcast_apply_full_axis( self, @@ -3495,20 +3504,15 @@ def broadcast_apply_full_axis( else: if new_columns is None: assert not is_list_like(dtypes) - # need something like this utility: construct_dtype() - kw["dtypes"] = ModinDtypes( - DtypesDescriptor( - # TODO: pyarrow backend - remaining_dtype=pandas.api.types.pandas_dtype(dtypes) - ) - ) + dtype = self.construct_dtype(dtypes, self._pandas_backend) + kw["dtypes"] = ModinDtypes(DtypesDescriptor(remaining_dtype=dtype)) else: kw["dtypes"] = ( pandas.Series(dtypes, index=new_columns) if is_list_like(dtypes) else pandas.Series( - # TODO: pyarrow backend - [pandas.api.types.pandas_dtype(dtypes)] * len(new_columns), + [self.construct_dtype(dtypes, self._pandas_backend)] + * len(new_columns), index=new_columns, ) ) @@ -4486,8 +4490,8 @@ def from_pandas(cls, df): new_index = df.index new_columns = df.columns new_dtypes = df.dtypes - new_frame, new_lengths, new_widths = 
cls._partition_mgr_cls.from_pandas( - df, True + new_frame, new_lengths, new_widths, backend = ( + cls._partition_mgr_cls.from_pandas(df, True) ) return cls( new_frame, @@ -4496,6 +4500,7 @@ def from_pandas(cls, df): new_lengths, new_widths, dtypes=new_dtypes, + backend=backend, ) @classmethod @@ -4513,7 +4518,7 @@ def from_arrow(cls, at): PandasDataframe New Modin DataFrame. """ - new_frame, new_lengths, new_widths = cls._partition_mgr_cls.from_arrow( + new_frame, new_lengths, new_widths, backend = cls._partition_mgr_cls.from_arrow( at, return_dims=True ) new_columns = Index.__new__(Index, data=at.column_names, dtype="O") @@ -4529,6 +4534,7 @@ def from_arrow(cls, at): row_lengths=new_lengths, column_widths=new_widths, dtypes=new_dtypes, + backend=backend, ) @classmethod @@ -4708,7 +4714,7 @@ def __dataframe__(self, nan_as_null: bool = False, allow_copy: bool = True): ) @classmethod - def from_dataframe(cls, df: "ProtocolDataframe") -> PandasDataframe: + def from_dataframe(cls, df: ProtocolDataframe) -> PandasDataframe: """ Convert a DataFrame implementing the dataframe exchange protocol to a Core Modin Dataframe. diff --git a/modin/core/dataframe/pandas/partitioning/partition_manager.py b/modin/core/dataframe/pandas/partitioning/partition_manager.py index 0da5303fba9..3a6ad333df6 100644 --- a/modin/core/dataframe/pandas/partitioning/partition_manager.py +++ b/modin/core/dataframe/pandas/partitioning/partition_manager.py @@ -928,10 +928,13 @@ def update_bar(f): parts = cls.split_pandas_df_into_partitions( df, row_chunksize, col_chunksize, update_bar ) + backend = None + if any(isinstance(x, pandas.ArrowDtype) for x in df.dtypes): + backend = "pyarrow" if ProgressBar.get(): pbar.close() if not return_dims: - return parts + return parts, backend else: row_lengths = [ ( @@ -949,7 +952,7 @@ def update_bar(f): ) for i in range(0, len(df.columns), col_chunksize) ] - return parts, row_lengths, col_widths + return parts, row_lengths, col_widths, backend @classmethod def from_arrow(cls, at, return_dims=False): @@ -969,6 +972,7 @@ def from_arrow(cls, at, return_dims=False): np.ndarray or (np.ndarray, row_lengths, col_widths) A NumPy array with partitions (with dimensions or not). """ + # also return backend return cls.from_pandas(at.to_pandas(), return_dims=return_dims) @classmethod From 4dba613a0f77a62f7ffad3604e555704d81a265b Mon Sep 17 00:00:00 2001 From: Anatoly Myachev Date: Mon, 6 May 2024 12:30:17 +0200 Subject: [PATCH 19/50] fix Signed-off-by: Anatoly Myachev --- modin/pandas/dataframe.py | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/modin/pandas/dataframe.py b/modin/pandas/dataframe.py index df0c2b9f436..c56a69726a9 100644 --- a/modin/pandas/dataframe.py +++ b/modin/pandas/dataframe.py @@ -1624,11 +1624,14 @@ def prod( and min_count > len(axis_to_apply) ): new_index = self.columns if not axis else self.index + # >>> pd.DataFrame([1,2,3,4], dtype="int64[pyarrow]").prod(min_count=10) + # 0 + # dtype: int64[pyarrow] return Series( [np.nan] * len(new_index), index=new_index, # TODO: pyarrow backend? - dtype=pandas.api.types.pandas_dtype("object"), + dtype=pandas.api.types.pandas_dtype("float64"), ) data = self._validate_dtypes_sum_prod_mean(axis, numeric_only, ignore_axis=True) @@ -2154,7 +2157,7 @@ def sum( [np.nan] * len(new_index), index=new_index, # TODO: pyarrow backend? 
- dtype=pandas.api.types.pandas_dtype("object"), + dtype=pandas.api.types.pandas_dtype("float64"), ) data = self._validate_dtypes_sum_prod_mean( From ea05389c797a0da2d30c7ec38bc464c145da308e Mon Sep 17 00:00:00 2001 From: Anatoly Myachev Date: Mon, 6 May 2024 14:01:19 +0200 Subject: [PATCH 20/50] fix Signed-off-by: Anatoly Myachev --- modin/core/dataframe/pandas/dataframe/dataframe.py | 2 +- modin/core/storage_formats/base/query_compiler.py | 14 +++++++++++++- modin/pandas/dataframe.py | 6 ++++-- 3 files changed, 18 insertions(+), 4 deletions(-) diff --git a/modin/core/dataframe/pandas/dataframe/dataframe.py b/modin/core/dataframe/pandas/dataframe/dataframe.py index f8e7fb7d91b..0d2e9170eeb 100644 --- a/modin/core/dataframe/pandas/dataframe/dataframe.py +++ b/modin/core/dataframe/pandas/dataframe/dataframe.py @@ -3418,7 +3418,7 @@ def broadcast_apply_select_indices( new_partitions, index=new_index, columns=new_columns ) - def construct_dtype(dtype: str, backend: Optional[str]): + def construct_dtype(self, dtype: str, backend: Optional[str]): if backend is None: return pandas.api.types.pandas_dtype(dtype) elif backend == "pyarrow": diff --git a/modin/core/storage_formats/base/query_compiler.py b/modin/core/storage_formats/base/query_compiler.py index c6e09d00dfb..dbf964ee24a 100644 --- a/modin/core/storage_formats/base/query_compiler.py +++ b/modin/core/storage_formats/base/query_compiler.py @@ -21,7 +21,7 @@ import abc import warnings -from typing import Callable, Hashable, List, Optional +from typing import TYPE_CHECKING, Callable, Hashable, List, Optional import numpy as np import pandas @@ -52,6 +52,10 @@ from . import doc_utils +if TYPE_CHECKING: + # TODO: should be ModinDataframe + from modin.core.dataframe.pandas.dataframe.dataframe import PandasDataframe + def _get_axis(axis): """ @@ -126,6 +130,8 @@ class BaseQueryCompiler( for a list of requirements for subclassing this object. """ + _modin_frame: PandasDataframe + def __wrap_in_qc(self, obj): """ Wrap `obj` in query compiler. @@ -6747,6 +6753,12 @@ def case_when(self, caselist): # noqa: PR01, RT01, D200 ] return SeriesDefault.register(pandas.Series.case_when)(self, caselist=caselist) + def construct_dtype(self, dtype: str, backend: Optional[str]): + return self._modin_frame.construct_dtype(dtype, backend) + + def get_backend(self) -> str: + return self._modin_frame._pandas_backend + def repartition(self, axis=None): """ Repartitioning QueryCompiler objects to get ideal partitions inside. diff --git a/modin/pandas/dataframe.py b/modin/pandas/dataframe.py index c56a69726a9..5b0ea87053a 100644 --- a/modin/pandas/dataframe.py +++ b/modin/pandas/dataframe.py @@ -1622,6 +1622,8 @@ def prod( skipna is not False and numeric_only is False and min_count > len(axis_to_apply) + # Type inference is not so simple for pyarrow + and self._query_compiler.get_backend() == "default" ): new_index = self.columns if not axis else self.index # >>> pd.DataFrame([1,2,3,4], dtype="int64[pyarrow]").prod(min_count=10) @@ -1630,7 +1632,6 @@ def prod( return Series( [np.nan] * len(new_index), index=new_index, - # TODO: pyarrow backend? dtype=pandas.api.types.pandas_dtype("float64"), ) @@ -2151,12 +2152,13 @@ def sum( skipna is not False and numeric_only is False and min_count > len(axis_to_apply) + # Type inference is not so simple for pyarrow + and self._query_compiler.get_backend() == "default" ): new_index = self.columns if not axis else self.index return Series( [np.nan] * len(new_index), index=new_index, - # TODO: pyarrow backend? 
dtype=pandas.api.types.pandas_dtype("float64"), ) From 005f4802c3520e8c1be6c5c5e95a048aa186ec1a Mon Sep 17 00:00:00 2001 From: Anatoly Myachev Date: Mon, 6 May 2024 16:02:00 +0200 Subject: [PATCH 21/50] fix Signed-off-by: Anatoly Myachev --- modin/core/dataframe/pandas/dataframe/dataframe.py | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/modin/core/dataframe/pandas/dataframe/dataframe.py b/modin/core/dataframe/pandas/dataframe/dataframe.py index 0d2e9170eeb..6a32741b3cb 100644 --- a/modin/core/dataframe/pandas/dataframe/dataframe.py +++ b/modin/core/dataframe/pandas/dataframe/dataframe.py @@ -1702,7 +1702,7 @@ def astype_builder(df): # Assume that the dtype is a scalar. if not (col_dtypes == self_dtypes).all(): new_dtypes = self_dtypes.copy() - new_dtype = self.construct_dtype(col_dtypes) + new_dtype = self.construct_dtype(col_dtypes, self._pandas_backend) if Engine.get() == "Dask" and hasattr(new_dtype, "_is_materialized"): # FIXME: https://github.com/dask/distributed/issues/8585 _ = new_dtype._materialize_categories() @@ -4525,7 +4525,7 @@ def from_pandas(cls, df): new_index = df.index new_columns = df.columns new_dtypes = df.dtypes - new_frame, new_lengths, new_widths, backend = ( + new_frame, new_lengths, new_widths, pandas_backend = ( cls._partition_mgr_cls.from_pandas(df, True) ) return cls( @@ -4535,7 +4535,7 @@ def from_pandas(cls, df): new_lengths, new_widths, dtypes=new_dtypes, - backend=backend, + pandas_backend=pandas_backend, ) @classmethod @@ -4553,8 +4553,8 @@ def from_arrow(cls, at): PandasDataframe New Modin DataFrame. """ - new_frame, new_lengths, new_widths, backend = cls._partition_mgr_cls.from_arrow( - at, return_dims=True + new_frame, new_lengths, new_widths, pandas_backend = ( + cls._partition_mgr_cls.from_arrow(at, return_dims=True) ) new_columns = Index.__new__(Index, data=at.column_names, dtype="O") new_index = Index.__new__(RangeIndex, data=range(at.num_rows)) @@ -4569,7 +4569,7 @@ def from_arrow(cls, at): row_lengths=new_lengths, column_widths=new_widths, dtypes=new_dtypes, - backend=backend, + pandas_backend=pandas_backend, ) @classmethod From 0d34bea374010c2465642456757c8fc68be45dc4 Mon Sep 17 00:00:00 2001 From: Anatoly Myachev Date: Mon, 6 May 2024 16:53:24 +0200 Subject: [PATCH 22/50] fix Signed-off-by: Anatoly Myachev --- modin/core/storage_formats/pandas/query_compiler.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/modin/core/storage_formats/pandas/query_compiler.py b/modin/core/storage_formats/pandas/query_compiler.py index b139ad4ff56..866a2456fae 100644 --- a/modin/core/storage_formats/pandas/query_compiler.py +++ b/modin/core/storage_formats/pandas/query_compiler.py @@ -4522,7 +4522,9 @@ def iloc_mut(partition, row_internal_indices, col_internal_indices, item): # to the following warning. Example: "FutureWarning: Setting an item of incompatible # dtype is deprecated and will raise in a future error of pandas. Value '[1.38629436]' # has dtype incompatible with int64, please explicitly cast to a compatible dtype first." 
- partition.iloc[row_internal_indices, col_internal_indices] = item.copy() + partition.iloc[row_internal_indices, col_internal_indices] = ( + item.copy() if hasattr(item, "copy") else item + ) return partition if not is_scalar(item): From aac70970ab06302af0817081078966a0076914bd Mon Sep 17 00:00:00 2001 From: Anatoly Myachev Date: Mon, 6 May 2024 17:02:17 +0200 Subject: [PATCH 23/50] fix Signed-off-by: Anatoly Myachev --- modin/tests/pandas/dataframe/test_binary.py | 2 ++ modin/tests/pandas/dataframe/test_default.py | 2 ++ 2 files changed, 4 insertions(+) diff --git a/modin/tests/pandas/dataframe/test_binary.py b/modin/tests/pandas/dataframe/test_binary.py index 062250bbd8a..a8858cf799a 100644 --- a/modin/tests/pandas/dataframe/test_binary.py +++ b/modin/tests/pandas/dataframe/test_binary.py @@ -465,6 +465,8 @@ def test_non_commutative_multiply(): eval_general(modin_df, pandas_df, lambda s: s * integer) +# TODO: just for developing purpose; remove `xfail` mark +@pytest.mark.xfail @pytest.mark.parametrize( "op", [ diff --git a/modin/tests/pandas/dataframe/test_default.py b/modin/tests/pandas/dataframe/test_default.py index 0f3ca39fd72..173e90e8762 100644 --- a/modin/tests/pandas/dataframe/test_default.py +++ b/modin/tests/pandas/dataframe/test_default.py @@ -528,6 +528,8 @@ def test_info(data, verbose, max_cols, memory_usage, show_counts): assert modin_info[1:] == pandas_info[1:] +# TODO: just for developing purpose; remove `xfail` mark +@pytest.mark.xfail @pytest.mark.parametrize("axis", axis_values, ids=axis_keys) @pytest.mark.parametrize("skipna", [False, True]) @pytest.mark.parametrize("numeric_only", [False, True]) From 258c3b9a70b294bc113fd80b6a44d770853f9086 Mon Sep 17 00:00:00 2001 From: Anatoly Myachev Date: Mon, 6 May 2024 21:10:31 +0200 Subject: [PATCH 24/50] fix Signed-off-by: Anatoly Myachev --- modin/core/dataframe/algebra/fold.py | 13 +++- .../dataframe/pandas/dataframe/dataframe.py | 71 ++++++++++++++++--- .../storage_formats/pandas/aggregations.py | 12 ++-- .../storage_formats/pandas/query_compiler.py | 24 ++++++- .../dataframe/pandas/partitions.py | 4 +- modin/logging/logger_decorator.py | 17 +++-- modin/pandas/general.py | 2 +- modin/tests/pandas/dataframe/test_binary.py | 17 +++-- 8 files changed, 132 insertions(+), 28 deletions(-) diff --git a/modin/core/dataframe/algebra/fold.py b/modin/core/dataframe/algebra/fold.py index 419a0b56903..9f6673a3e0a 100644 --- a/modin/core/dataframe/algebra/fold.py +++ b/modin/core/dataframe/algebra/fold.py @@ -13,14 +13,21 @@ """Module houses builder class for Fold operator.""" +from __future__ import annotations + +from typing import TYPE_CHECKING, Callable + from .operator import Operator +if TYPE_CHECKING: + from modin.core.storage_formats.pandas.query_compiler import PandasQueryCompiler + class Fold(Operator): """Builder class for Fold functions.""" @classmethod - def register(cls, fold_function): + def register(cls, fold_function) -> Callable[..., PandasQueryCompiler]: """ Build Fold operator that will be performed across rows/columns. @@ -35,7 +42,9 @@ def register(cls, fold_function): Function that takes query compiler and executes Fold function. """ - def caller(query_compiler, fold_axis=None, *args, **kwargs): + def caller( + query_compiler: PandasQueryCompiler, fold_axis=None, *args, **kwargs + ) -> PandasQueryCompiler: """ Execute Fold function against passed query compiler. 
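A sketch of what the `Fold.register` annotations above enable (hypothetical
usage; it assumes `Fold` is importable from `modin.core.dataframe.algebra`):

    import pandas

    from modin.core.dataframe.algebra import Fold

    # The registered callable now advertises `PandasQueryCompiler` as its
    # return type, so type checkers can follow query-compiler pipelines
    # instead of treating every registered function as returning `Any`.
    fold_convert_dtypes = Fold.register(pandas.DataFrame.convert_dtypes)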
diff --git a/modin/core/dataframe/pandas/dataframe/dataframe.py b/modin/core/dataframe/pandas/dataframe/dataframe.py index 6a32741b3cb..8f642592856 100644 --- a/modin/core/dataframe/pandas/dataframe/dataframe.py +++ b/modin/core/dataframe/pandas/dataframe/dataframe.py @@ -881,7 +881,7 @@ def synchronize_labels(self, axis=None): else: self._deferred_column = True - def _propagate_index_objs(self, axis=None): + def _propagate_index_objs(self, axis=None) -> None: """ Synchronize labels by applying the index object for specific `axis` to the `self._partitions` lazily. @@ -1320,6 +1320,7 @@ def _take_2d_positional( new_row_lengths, new_col_widths, new_dtypes, + pandas_backend=self._pandas_backend, ) return self._maybe_reorder_labels( @@ -1494,6 +1495,7 @@ def from_labels_executor(df, **kwargs): row_lengths=self._row_lengths_cache, column_widths=new_column_widths, dtypes=new_dtypes, + pandas_backend=self._pandas_backend, ) # Set flag for propagating deferred row labels across dataframe partitions result.synchronize_labels(axis=0) @@ -1620,7 +1622,13 @@ def _reorder_labels(self, row_positions=None, col_positions=None): col_idx = self.copy_columns_cache(copy_lengths=True) new_widths = self._column_widths_cache return self.__constructor__( - ordered_cols, row_idx, col_idx, new_lengths, new_widths, new_dtypes + ordered_cols, + row_idx, + col_idx, + new_lengths, + new_widths, + new_dtypes, + pandas_backend=self._pandas_backend, ) @lazy_metadata_decorator(apply_axis=None) @@ -1640,6 +1648,7 @@ def copy(self): self._row_lengths_cache, self._column_widths_cache, self.copy_dtypes_cache(), + pandas_backend=self._pandas_backend, ) @lazy_metadata_decorator(apply_axis="both") @@ -1742,6 +1751,7 @@ def astype_builder(df): self._row_lengths_cache, self._column_widths_cache, new_dtypes, + pandas_backend=self._pandas_backend, ) def numeric_columns(self, include_bool=True): @@ -2102,6 +2112,7 @@ def _compute_tree_reduce_metadata(self, axis, new_parts, dtypes=None): *new_axes, *new_axes_lengths, dtypes, + pandas_backend=self._pandas_backend, ) return result @@ -2287,6 +2298,7 @@ def map( self._row_lengths_cache, self._column_widths_cache, dtypes=dtypes, + pandas_backend=self._pandas_backend, ) def window( @@ -2366,6 +2378,7 @@ def fold(self, axis, func, new_columns=None): self.copy_columns_cache(copy_lengths=True), self._row_lengths_cache, self._column_widths_cache, + pandas_backend=self._pandas_backend, ) def infer_objects(self) -> PandasDataframe: @@ -2412,6 +2425,7 @@ def infer_types(self, col_labels: List[str]) -> PandasDataframe: self._row_lengths_cache, self._column_widths_cache, new_dtypes, + pandas_backend=self._pandas_backend, ) def join( @@ -2517,6 +2531,7 @@ def combine_and_apply( self._row_lengths_cache, [len(self.columns)] if self.has_materialized_columns else None, self.copy_dtypes_cache(), + pandas_backend=self._pandas_backend, ) else: modin_frame = self @@ -2820,6 +2835,7 @@ def filter(self, axis: Union[Axis, int], condition: Callable) -> PandasDataframe *new_axes, *new_lengths, self.copy_dtypes_cache() if axis == Axis.COL_WISE else None, + pandas_backend=self._pandas_backend, ) def filter_by_types(self, types: List[Hashable]) -> PandasDataframe: @@ -2873,7 +2889,12 @@ def explode(self, axis: Union[int, Axis], func: Callable) -> PandasDataframe: 1, partitions ) return self.__constructor__( - partitions, new_index, new_columns, row_lengths, column_widths + partitions, + new_index, + new_columns, + row_lengths, + column_widths, + pandas_backend=self._pandas_backend, ) def combine(self) -> 
PandasDataframe: @@ -2901,6 +2922,7 @@ def combine(self) -> PandasDataframe: else None ), dtypes=self.copy_dtypes_cache(), + pandas_backend=self._pandas_backend, ) result.synchronize_labels() return result @@ -3050,7 +3072,13 @@ def apply_full_axis_select_indices( if new_columns is None: new_columns = self.columns if axis == 0 else None return self.__constructor__( - new_partitions, new_index, new_columns, None, None, dtypes=new_dtypes + new_partitions, + new_index, + new_columns, + None, + None, + dtypes=new_dtypes, + pandas_backend=self._pandas_backend, ) @lazy_metadata_decorator(apply_axis="both") @@ -3145,6 +3173,7 @@ def apply_select_indices( lengths_objs[0], lengths_objs[1], new_dtypes, + pandas_backend=self._pandas_backend, ) else: # We are applying over both axes here, so make sure we have all the right @@ -3172,6 +3201,7 @@ def apply_select_indices( self._row_lengths_cache, self._column_widths_cache, new_dtypes, + pandas_backend=self._pandas_backend, ) @lazy_metadata_decorator(apply_axis="both") @@ -3277,6 +3307,7 @@ def _pick_axis(get_axis, sizes_cache): new_row_lengths, new_column_widths, dtypes=dtypes, + pandas_backend=self._pandas_backend, ) def _prepare_frame_to_broadcast(self, axis, indices, broadcast_all): @@ -3415,7 +3446,10 @@ def broadcast_apply_select_indices( keep_remaining, ) return self.__constructor__( - new_partitions, index=new_index, columns=new_columns + new_partitions, + index=new_index, + columns=new_columns, + pandas_backend=self._pandas_backend, ) def construct_dtype(self, dtype: str, backend: Optional[str]): @@ -3611,7 +3645,11 @@ def broadcast_apply_full_axis( kw["column_widths"] = self._column_widths_cache result = self.__constructor__( - new_partitions, index=new_index, columns=new_columns, **kw + new_partitions, + index=new_index, + columns=new_columns, + **kw, + pandas_backend=self._pandas_backend, ) if sync_labels and new_index is not None: result.synchronize_labels(axis=0) @@ -3833,6 +3871,7 @@ def n_ary_op( self.copy_columns_cache(copy_lengths=True), row_lengths, self._column_widths_cache, + pandas_backend=self._pandas_backend, ) new_right_frames = [ self.__constructor__( @@ -3841,6 +3880,7 @@ def n_ary_op( right_frame.copy_columns_cache(copy_lengths=True), row_lengths, right_frame._column_widths_cache, + pandas_backend=self._pandas_backend, ) for right_parts, right_frame in zip(list_of_right_parts, right_frames) ] @@ -3878,6 +3918,7 @@ def n_ary_op( row_lengths, column_widths, dtypes, + pandas_backend=self._pandas_backend, ) @lazy_metadata_decorator(apply_axis="both") @@ -4005,7 +4046,13 @@ def _compute_new_widths(): new_widths = None return self.__constructor__( - new_partitions, new_index, new_columns, new_lengths, new_widths, new_dtypes + new_partitions, + new_index, + new_columns, + new_lengths, + new_widths, + new_dtypes, + pandas_backend=self._pandas_backend, ) def _apply_func_to_range_partitioning_broadcast( @@ -4080,6 +4127,7 @@ def _apply_func_to_range_partitioning_broadcast( index=new_index, columns=new_columns, dtypes=new_dtypes, + pandas_backend=self._pandas_backend, ) @lazy_metadata_decorator(apply_axis="both") @@ -4428,6 +4476,7 @@ def join_cols(df, *cols): new_partitions, index=result.copy_index_cache(), row_lengths=result._row_lengths_cache, + pandas_backend=self._pandas_backend, ) if ( @@ -4504,7 +4553,10 @@ def groupby_reduce( axis, self._partitions, by_parts, map_func, reduce_func, apply_indices ) return self.__constructor__( - new_partitions, index=new_index, columns=new_columns + new_partitions, + index=new_index, + 
columns=new_columns, + pandas_backend=self._pandas_backend, ) @classmethod @@ -4689,6 +4741,7 @@ def transpose(self): self._column_widths_cache, self._row_lengths_cache, dtypes=new_dtypes, + pandas_backend=self._pandas_backend, ) @lazy_metadata_decorator(apply_axis="both") @@ -4876,6 +4929,7 @@ def remote_fn(df, name, caselist): # pragma: no cover columns, row_lengths, column_widths, + pandas_backend=self._pandas_backend, ) for part in list_of_right_parts ) @@ -4947,4 +5001,5 @@ def map_data( index=self.index, row_lengths=lengths, column_widths=[1], + pandas_backend=self._pandas_backend, ) diff --git a/modin/core/storage_formats/pandas/aggregations.py b/modin/core/storage_formats/pandas/aggregations.py index 7d5293b1017..6c2e795a523 100644 --- a/modin/core/storage_formats/pandas/aggregations.py +++ b/modin/core/storage_formats/pandas/aggregations.py @@ -13,6 +13,8 @@ """Contains implementations for aggregation functions.""" +from __future__ import annotations + from enum import Enum from typing import TYPE_CHECKING, Callable, Tuple @@ -38,7 +40,7 @@ class Method(Enum): @classmethod def build_corr_method( cls, - ) -> Callable[["PandasQueryCompiler", str, int, bool], "PandasQueryCompiler"]: + ) -> Callable[[PandasQueryCompiler, str, int, bool], PandasQueryCompiler]: """ Build a query compiler method computing the correlation matrix. @@ -49,12 +51,12 @@ def build_corr_method( """ def corr_method( - qc: "PandasQueryCompiler", + qc: PandasQueryCompiler, method: str, min_periods: int = 1, numeric_only: bool = True, - ) -> "PandasQueryCompiler": - if method != "pearson": + ) -> PandasQueryCompiler: + if method != "pearson" or qc._modin_frame._pandas_backend == "pyarrow": return super(type(qc), qc).corr( method=method, min_periods=min_periods, numeric_only=numeric_only ) @@ -103,7 +105,7 @@ def corr_method( @classmethod def build_cov_method( cls, - ) -> Callable[["PandasQueryCompiler", int, int], "PandasQueryCompiler"]: + ) -> Callable[[PandasQueryCompiler, int, int], PandasQueryCompiler]: """ Build a query compiler method computing the covariance matrix. diff --git a/modin/core/storage_formats/pandas/query_compiler.py b/modin/core/storage_formats/pandas/query_compiler.py index 866a2456fae..83d42965a9e 100644 --- a/modin/core/storage_formats/pandas/query_compiler.py +++ b/modin/core/storage_formats/pandas/query_compiler.py @@ -1858,7 +1858,29 @@ def isin_func(df, values): map = Map.register(pandas.DataFrame.map) # Will it work with pyarrow backend? conj = Map.register(lambda df, *args, **kwargs: pandas.DataFrame(np.conj(df))) - convert_dtypes = Fold.register(pandas.DataFrame.convert_dtypes) + + def convert_dtypes( + self, + infer_objects: bool = True, + convert_string: bool = True, + convert_integer: bool = True, + convert_boolean: bool = True, + convert_floating: bool = True, + dtype_backend: str = "numpy_nullable", + ): + result = Fold.register(pandas.DataFrame.convert_dtypes)( + self, + infer_objects=infer_objects, + convert_string=convert_string, + convert_integer=convert_integer, + convert_boolean=convert_boolean, + convert_floating=convert_floating, + dtype_backend=dtype_backend, + ) + if dtype_backend == "pyarrow": + result._modin_frame._pandas_backend = "pyarrow" + return result + invert = Map.register(pandas.DataFrame.__invert__, dtypes="copy") isna = Map.register(pandas.DataFrame.isna, dtypes="bool") # better way to distinguish methods for NumPy API? 
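A minimal sketch of the behavior the `convert_dtypes` override above targets
(hypothetical session; it relies on the `get_backend` accessor introduced in
PATCH 20 of this series and requires pyarrow to be installed):

    import modin.pandas as pd

    df = pd.DataFrame({"a": [1, 2, None]})
    converted = df.convert_dtypes(dtype_backend="pyarrow")
    # Every column now uses pandas.ArrowDtype, and the underlying frame is
    # tagged accordingly so later operations can branch on the backend.
    assert converted._query_compiler.get_backend() == "pyarrow"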
diff --git a/modin/distributed/dataframe/pandas/partitions.py b/modin/distributed/dataframe/pandas/partitions.py index cac3bec93b6..62a05ff81d3 100644 --- a/modin/distributed/dataframe/pandas/partitions.py +++ b/modin/distributed/dataframe/pandas/partitions.py @@ -90,7 +90,7 @@ def unwrap_partitions( f"Only API Layer objects may be passed in here, got {type(api_layer_object)} instead." ) - modin_frame = api_layer_object._query_compiler._modin_frame # type: ignore[attr-defined] + modin_frame = api_layer_object._query_compiler._modin_frame modin_frame._propagate_index_objs(None) if axis is None: @@ -122,7 +122,7 @@ def get_block(partition: PartitionUnionType) -> np.ndarray: ] actual_engine = type( - api_layer_object._query_compiler._modin_frame._partitions[0][0] # type: ignore[attr-defined] + api_layer_object._query_compiler._modin_frame._partitions[0][0] ).__name__ if actual_engine in ( "PandasOnRayDataframePartition", diff --git a/modin/logging/logger_decorator.py b/modin/logging/logger_decorator.py index 301fb02562b..662f7d1de73 100644 --- a/modin/logging/logger_decorator.py +++ b/modin/logging/logger_decorator.py @@ -19,7 +19,7 @@ from functools import wraps from types import FunctionType, MethodType -from typing import Any, Callable, Dict, Optional, Tuple, Type, Union +from typing import Any, Callable, Dict, Optional, Tuple, Type, TypeVar, Union, overload from modin.config import LogMode @@ -28,6 +28,9 @@ _MODIN_LOGGER_NOWRAP = "__modin_logging_nowrap__" +Fn = TypeVar("Fn", bound=Callable) + + def disable_logging(func: Callable) -> Callable: """ Disable logging of one particular function. Useful for decorated classes. @@ -46,11 +49,17 @@ def disable_logging(func: Callable) -> Callable: return func +@overload +def enable_logging(modin_layer: Fn) -> Fn: + # This helps preserve typings when the decorator is used without parentheses + ... + + def enable_logging( - modin_layer: Union[str, Callable, Type] = "PANDAS-API", + modin_layer: Union[str, Fn, Type] = "PANDAS-API", name: Optional[str] = None, log_level: LogLevel = LogLevel.INFO, -) -> Callable: +) -> Callable[[Fn], Fn]: """ Log Decorator used on specific Modin functions or classes. 
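A hedged illustration of why the `@overload` above helps: `enable_logging` is
used both bare and parametrized, and the overload keeps the decorated
function's own signature visible to type checkers in the bare form (names
below are hypothetical):

    from modin.logging import enable_logging

    @enable_logging
    def bare_use(x: int) -> int:  # checker sees (int) -> int, not Any
        return x

    @enable_logging("CUSTOM-LAYER")
    def parametrized_use() -> None: ...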
@@ -76,7 +85,7 @@ def enable_logging( # def func() return enable_logging()(modin_layer) - def decorator(obj: Any) -> Any: + def decorator(obj: Fn) -> Fn: """Decorate function or class to add logs to Modin API function(s).""" if isinstance(obj, type): seen: Dict[Any, Any] = {} diff --git a/modin/pandas/general.py b/modin/pandas/general.py index aeff9986f35..6c79752847d 100644 --- a/modin/pandas/general.py +++ b/modin/pandas/general.py @@ -415,7 +415,7 @@ def value_counts( ) -@_inherit_docstrings(pandas.concat, apilink="pandas.concat") +# @_inherit_docstrings(pandas.concat, apilink="pandas.concat") @enable_logging def concat( objs: "Iterable[DataFrame | Series] | Mapping[Hashable, DataFrame | Series]", diff --git a/modin/tests/pandas/dataframe/test_binary.py b/modin/tests/pandas/dataframe/test_binary.py index a8858cf799a..a1070d892b7 100644 --- a/modin/tests/pandas/dataframe/test_binary.py +++ b/modin/tests/pandas/dataframe/test_binary.py @@ -85,9 +85,16 @@ def test_math_functions(other, axis, op): # lambda == "series_or_list" pytest.xfail(reason="different behavior") - eval_general( - *create_test_dfs(data), lambda df: getattr(df, op)(other(df, axis), axis=axis) - ) + md_df, pd_df = create_test_dfs(data) + if op in ("mod", "rmod") and any("pyarrow" in str(dtype) for dtype in pd_df.dtypes): + with pytest.raises(NotImplementedError): + eval_general( + md_df, pd_df, lambda df: getattr(df, op)(other(df, axis), axis=axis) + ) + else: + eval_general( + md_df, pd_df, lambda df: getattr(df, op)(other(df, axis), axis=axis) + ) @pytest.mark.parametrize("other", [lambda df: 2, lambda df: df]) @@ -465,8 +472,8 @@ def test_non_commutative_multiply(): eval_general(modin_df, pandas_df, lambda s: s * integer) -# TODO: just for developing purpose; remove `xfail` mark -@pytest.mark.xfail +# TODO: just for developing purpose; remove `skip` mark +@pytest.mark.skip @pytest.mark.parametrize( "op", [ From 068f67ddc6c99a62095794cde2085d2120944f0d Mon Sep 17 00:00:00 2001 From: Anatoly Myachev Date: Mon, 6 May 2024 21:26:01 +0200 Subject: [PATCH 25/50] fix Signed-off-by: Anatoly Myachev --- modin/core/dataframe/pandas/dataframe/dataframe.py | 2 +- modin/core/storage_formats/pandas/query_compiler.py | 3 +++ 2 files changed, 4 insertions(+), 1 deletion(-) diff --git a/modin/core/dataframe/pandas/dataframe/dataframe.py b/modin/core/dataframe/pandas/dataframe/dataframe.py index 8f642592856..c33377a08ae 100644 --- a/modin/core/dataframe/pandas/dataframe/dataframe.py +++ b/modin/core/dataframe/pandas/dataframe/dataframe.py @@ -3453,7 +3453,7 @@ def broadcast_apply_select_indices( ) def construct_dtype(self, dtype: str, backend: Optional[str]): - if backend is None: + if backend is None or dtype == "category": return pandas.api.types.pandas_dtype(dtype) elif backend == "pyarrow": return pandas.api.types.pandas_dtype(f"{dtype}[{backend}]") diff --git a/modin/core/storage_formats/pandas/query_compiler.py b/modin/core/storage_formats/pandas/query_compiler.py index 83d42965a9e..33e05a6c491 100644 --- a/modin/core/storage_formats/pandas/query_compiler.py +++ b/modin/core/storage_formats/pandas/query_compiler.py @@ -2272,6 +2272,9 @@ def clip(self, lower, upper, **kwargs): corr = CorrCovBuilder.build_corr_method() def cov(self, min_periods=None, ddof=1): + if self._modin_frame._pandas_backend == "pyarrow": + return super().cov(min_periods=min_periods, ddof=ddof) + # _nancorr use numpy which incompatible with pandas dataframes on pyarrow return self._nancorr(min_periods=min_periods, cov=True, ddof=ddof) def _nancorr(self, 
min_periods=1, cov=False, ddof=1): From 45c1d1ff51a65146bec27937fbae1042b8dc85fd Mon Sep 17 00:00:00 2001 From: Anatoly Myachev Date: Tue, 7 May 2024 10:56:05 +0200 Subject: [PATCH 26/50] fix Signed-off-by: Anatoly Myachev --- modin/core/dataframe/pandas/dataframe/dataframe.py | 5 +++-- modin/tests/pandas/dataframe/test_map_metadata.py | 10 +++++++--- 2 files changed, 10 insertions(+), 5 deletions(-) diff --git a/modin/core/dataframe/pandas/dataframe/dataframe.py b/modin/core/dataframe/pandas/dataframe/dataframe.py index c33377a08ae..604d1db03d4 100644 --- a/modin/core/dataframe/pandas/dataframe/dataframe.py +++ b/modin/core/dataframe/pandas/dataframe/dataframe.py @@ -1681,7 +1681,7 @@ def astype(self, col_dtypes, errors: str = "raise"): if new_dtypes is None: new_dtypes = self_dtypes.copy() # Update the new dtype series to the proper pandas dtype - # TODO: pyarrow backend? We don't need to add an implicit backend for `astype` + # We don't need to add an implicit backend for `astype` new_dtype = pandas.api.types.pandas_dtype(dtype) if Engine.get() == "Dask" and hasattr(dtype, "_is_materialized"): # FIXME: https://github.com/dask/distributed/issues/8585 @@ -1711,7 +1711,8 @@ def astype_builder(df): # Assume that the dtype is a scalar. if not (col_dtypes == self_dtypes).all(): new_dtypes = self_dtypes.copy() - new_dtype = self.construct_dtype(col_dtypes, self._pandas_backend) + # We don't need to add an implicit backend for `astype` + new_dtype = pandas.api.types.pandas_dtype(col_dtypes) if Engine.get() == "Dask" and hasattr(new_dtype, "_is_materialized"): # FIXME: https://github.com/dask/distributed/issues/8585 _ = new_dtype._materialize_categories() diff --git a/modin/tests/pandas/dataframe/test_map_metadata.py b/modin/tests/pandas/dataframe/test_map_metadata.py index ab7a7fa4a31..b4980118922 100644 --- a/modin/tests/pandas/dataframe/test_map_metadata.py +++ b/modin/tests/pandas/dataframe/test_map_metadata.py @@ -1429,6 +1429,9 @@ def comparator(df1, df2): elif idx == 2: # FIXME: https://github.com/modin-project/modin/issues/7080 expected_exception = False + + if any("pyarrow" in str(dtype) for dtype in pandas_df.dtypes): + pytest.xfail(reason="ValueError(2)") eval_insert( modin_df, pandas_df, @@ -1683,12 +1686,13 @@ def test___neg__(request, data): @pytest.mark.parametrize("data", test_data_values, ids=test_data_keys) def test___invert__(data, request): expected_exception = None + md_df, pd_df = create_test_dfs(data) if "float_nan_data" in request.node.callspec.id: # FIXME: https://github.com/modin-project/modin/issues/7081 expected_exception = False - eval_general( - *create_test_dfs(data), lambda df: ~df, expected_exception=expected_exception - ) + if any("pyarrow" in str(dtype) for dtype in pd_df.dtypes): + pytest.xfail(reason="pyarrow.lib.ArrowNotImplementedError") + eval_general(md_df, pd_df, lambda df: ~df, expected_exception=expected_exception) def test___invert___bool(): From b114314f9471539226cde6dd143df05e8a2e1bed Mon Sep 17 00:00:00 2001 From: Anatoly Myachev Date: Tue, 7 May 2024 12:49:41 +0200 Subject: [PATCH 27/50] fix Signed-off-by: Anatoly Myachev --- modin/core/dataframe/pandas/dataframe/dataframe.py | 6 ++++-- .../implementations/hdk_on_native/calcite_serializer.py | 1 - 2 files changed, 4 insertions(+), 3 deletions(-) diff --git a/modin/core/dataframe/pandas/dataframe/dataframe.py b/modin/core/dataframe/pandas/dataframe/dataframe.py index 604d1db03d4..7c8f6e2ef6a 100644 --- a/modin/core/dataframe/pandas/dataframe/dataframe.py +++ 
b/modin/core/dataframe/pandas/dataframe/dataframe.py @@ -21,6 +21,7 @@ from __future__ import annotations import datetime +from functools import cached_property import re from typing import TYPE_CHECKING, Callable, Dict, Hashable, List, Optional, Union @@ -109,8 +110,8 @@ class PandasDataframe( _dtypes: Optional[ModinDtypes] = None _pandas_backend: str = None - @pandas.util.cache_readonly - def __constructor__(self) -> Callable[..., PandasDataframe]: + @cached_property + def __constructor__(self) -> type[PandasDataframe]: """ Create a new instance of this object. @@ -1745,6 +1746,7 @@ def astype_builder(df): new_frame = self._partition_mgr_cls.lazy_map_partitions( self._partitions, astype_builder ) + # TODO: recompute _pandas_backend (it can be changed) return self.__constructor__( new_frame, self.copy_index_cache(copy_lengths=True), diff --git a/modin/experimental/core/execution/native/implementations/hdk_on_native/calcite_serializer.py b/modin/experimental/core/execution/native/implementations/hdk_on_native/calcite_serializer.py index b00e73dc745..7099751dafe 100644 --- a/modin/experimental/core/execution/native/implementations/hdk_on_native/calcite_serializer.py +++ b/modin/experimental/core/execution/native/implementations/hdk_on_native/calcite_serializer.py @@ -67,7 +67,6 @@ class CalciteSerializer: "datetime64": "TIMESTAMP", } - # TODO: Is it necessary to use more general types here (not dependent on NumPy)? _INT_OPTS = { np.int8: ("TINYINT", 3), np.int16: ("SMALLINT", 5), From c597f7ffd5de330b02f11bc91845e7651e80cfa6 Mon Sep 17 00:00:00 2001 From: Anatoly Myachev Date: Wed, 8 May 2024 00:36:38 +0200 Subject: [PATCH 28/50] cleanup Signed-off-by: Anatoly Myachev --- modin/core/dataframe/pandas/dataframe/dataframe.py | 2 +- .../native/implementations/hdk_on_native/dataframe/utils.py | 6 ++++-- modin/pandas/general.py | 2 +- 3 files changed, 6 insertions(+), 4 deletions(-) diff --git a/modin/core/dataframe/pandas/dataframe/dataframe.py b/modin/core/dataframe/pandas/dataframe/dataframe.py index 7c8f6e2ef6a..a103921bc72 100644 --- a/modin/core/dataframe/pandas/dataframe/dataframe.py +++ b/modin/core/dataframe/pandas/dataframe/dataframe.py @@ -21,8 +21,8 @@ from __future__ import annotations import datetime -from functools import cached_property import re +from functools import cached_property from typing import TYPE_CHECKING, Callable, Dict, Hashable, List, Optional, Union import numpy as np diff --git a/modin/experimental/core/execution/native/implementations/hdk_on_native/dataframe/utils.py b/modin/experimental/core/execution/native/implementations/hdk_on_native/dataframe/utils.py index 4f749cf0e3b..f99cc256baa 100644 --- a/modin/experimental/core/execution/native/implementations/hdk_on_native/dataframe/utils.py +++ b/modin/experimental/core/execution/native/implementations/hdk_on_native/dataframe/utils.py @@ -23,7 +23,7 @@ import pandas import pyarrow as pa from pandas.core.arrays.arrow.extension_types import ArrowIntervalType -from pandas.core.dtypes.common import _get_dtype +from pandas.core.dtypes.common import _get_dtype, is_string_dtype from pyarrow.types import is_dictionary from modin.pandas.indexing import is_range_like @@ -504,7 +504,9 @@ def to_arrow_type(dtype) -> pa.lib.DataType: ------- pa.lib.DataType """ - return pandas.api.types.pandas_dtype(dtype).pyarrow_dtype + if is_string_dtype(dtype): + return pa.from_numpy_dtype(str) + return pa.from_numpy_dtype(dtype) def get_common_arrow_type(t1: pa.lib.DataType, t2: pa.lib.DataType) -> pa.lib.DataType: diff --git 
a/modin/pandas/general.py b/modin/pandas/general.py index 6c79752847d..aeff9986f35 100644 --- a/modin/pandas/general.py +++ b/modin/pandas/general.py @@ -415,7 +415,7 @@ def value_counts( ) -# @_inherit_docstrings(pandas.concat, apilink="pandas.concat") +@_inherit_docstrings(pandas.concat, apilink="pandas.concat") @enable_logging def concat( objs: "Iterable[DataFrame | Series] | Mapping[Hashable, DataFrame | Series]", From 9562144575fe0b8246a534ff543510b5befceb5b Mon Sep 17 00:00:00 2001 From: Anatoly Myachev Date: Fri, 10 May 2024 13:30:09 +0200 Subject: [PATCH 29/50] updates Signed-off-by: Anatoly Myachev --- .../dataframe/pandas/dataframe/dataframe.py | 45 ++++++++++++++++--- 1 file changed, 39 insertions(+), 6 deletions(-) diff --git a/modin/core/dataframe/pandas/dataframe/dataframe.py b/modin/core/dataframe/pandas/dataframe/dataframe.py index a103921bc72..fd882539c47 100644 --- a/modin/core/dataframe/pandas/dataframe/dataframe.py +++ b/modin/core/dataframe/pandas/dataframe/dataframe.py @@ -108,7 +108,7 @@ class PandasDataframe( _index_cache: ModinIndex = None _columns_cache: ModinIndex = None _dtypes: Optional[ModinDtypes] = None - _pandas_backend: str = None + _pandas_backend: Optional[str] = None @cached_property def __constructor__(self) -> type[PandasDataframe]: @@ -1321,6 +1321,7 @@ def _take_2d_positional( new_row_lengths, new_col_widths, new_dtypes, + # CHECKED: backend preserved pandas_backend=self._pandas_backend, ) @@ -1460,7 +1461,9 @@ def from_labels(self) -> PandasDataframe: new_column_names = pandas.Index(level_names, tupleize_cols=False) new_columns = new_column_names.append(self.columns) - def from_labels_executor(df, **kwargs): + def from_labels_executor( + df: pandas.DataFrame, **kwargs + ) -> pandas.DataFrame: # pragma: no cover # Setting the names here ensures that external and internal metadata always match. 
df.index.names = new_column_names @@ -1496,6 +1499,7 @@ def from_labels_executor(df, **kwargs): row_lengths=self._row_lengths_cache, column_widths=new_column_widths, dtypes=new_dtypes, + # CHECKED: backend preserved pandas_backend=self._pandas_backend, ) # Set flag for propagating deferred row labels across dataframe partitions @@ -1629,6 +1633,7 @@ def _reorder_labels(self, row_positions=None, col_positions=None): new_lengths, new_widths, new_dtypes, + # CHECKED: backend preserved pandas_backend=self._pandas_backend, ) @@ -1649,6 +1654,7 @@ def copy(self): self._row_lengths_cache, self._column_widths_cache, self.copy_dtypes_cache(), + # CHECKED: backend preserved pandas_backend=self._pandas_backend, ) @@ -1754,6 +1760,7 @@ def astype_builder(df): self._row_lengths_cache, self._column_widths_cache, new_dtypes, + # TODO: backend can be changed pandas_backend=self._pandas_backend, ) @@ -2115,6 +2122,7 @@ def _compute_tree_reduce_metadata(self, axis, new_parts, dtypes=None): *new_axes, *new_axes_lengths, dtypes, + # CHECKED: backend may be changed depending on function pandas_backend=self._pandas_backend, ) return result @@ -2301,6 +2309,7 @@ def map( self._row_lengths_cache, self._column_widths_cache, dtypes=dtypes, + # CHECKED: backend may be changed depending on function pandas_backend=self._pandas_backend, ) @@ -2381,6 +2390,7 @@ def fold(self, axis, func, new_columns=None): self.copy_columns_cache(copy_lengths=True), self._row_lengths_cache, self._column_widths_cache, + # CHECKED: backend may be changed depending on function pandas_backend=self._pandas_backend, ) @@ -2428,6 +2438,7 @@ def infer_types(self, col_labels: List[str]) -> PandasDataframe: self._row_lengths_cache, self._column_widths_cache, new_dtypes, + # CHECKED: backend may be changed depending on `new_cols_dtypes` pandas_backend=self._pandas_backend, ) @@ -2534,6 +2545,7 @@ def combine_and_apply( self._row_lengths_cache, [len(self.columns)] if self.has_materialized_columns else None, self.copy_dtypes_cache(), + # CHECKED: backend preserved pandas_backend=self._pandas_backend, ) else: @@ -2838,6 +2850,7 @@ def filter(self, axis: Union[Axis, int], condition: Callable) -> PandasDataframe *new_axes, *new_lengths, self.copy_dtypes_cache() if axis == Axis.COL_WISE else None, + # CHECKED: backend preserved pandas_backend=self._pandas_backend, ) @@ -2897,6 +2910,7 @@ def explode(self, axis: Union[int, Axis], func: Callable) -> PandasDataframe: new_columns, row_lengths, column_widths, + # TODO: need check pandas_backend=self._pandas_backend, ) @@ -2925,6 +2939,7 @@ def combine(self) -> PandasDataframe: else None ), dtypes=self.copy_dtypes_cache(), + # CHECKED: backend preserved pandas_backend=self._pandas_backend, ) result.synchronize_labels() @@ -3081,6 +3096,7 @@ def apply_full_axis_select_indices( None, None, dtypes=new_dtypes, + # CHECKED: backend may be changed depending on function pandas_backend=self._pandas_backend, ) @@ -3176,6 +3192,7 @@ def apply_select_indices( lengths_objs[0], lengths_objs[1], new_dtypes, + # CHECKED: backend may be changed depending on function pandas_backend=self._pandas_backend, ) else: @@ -3204,6 +3221,7 @@ def apply_select_indices( self._row_lengths_cache, self._column_widths_cache, new_dtypes, + # CHECKED: backend may be changed depending on function pandas_backend=self._pandas_backend, ) @@ -3310,6 +3328,7 @@ def _pick_axis(get_axis, sizes_cache): new_row_lengths, new_column_widths, dtypes=dtypes, + # CHECKED: backend may be changed depending on function pandas_backend=self._pandas_backend, ) @@ 
-3375,14 +3394,14 @@ def broadcast_apply_select_indices( self, axis, func, - other, + other: PandasDataframe, apply_indices=None, numeric_indices=None, keep_remaining=False, broadcast_all=True, new_index=None, new_columns=None, - ): + ) -> PandasDataframe: """ Apply a function to select indices at specified axis and broadcast partitions of `other` Modin DataFrame. @@ -3452,6 +3471,7 @@ def broadcast_apply_select_indices( new_partitions, index=new_index, columns=new_columns, + # CHECKED: backend may be changed depending on function pandas_backend=self._pandas_backend, ) @@ -3576,6 +3596,7 @@ def broadcast_apply_full_axis( else: if new_columns is None: assert not is_list_like(dtypes) + # CHECKED: backend may be changed depending on function dtype = self.construct_dtype(dtypes, self._pandas_backend) kw["dtypes"] = ModinDtypes(DtypesDescriptor(remaining_dtype=dtype)) else: @@ -3583,6 +3604,7 @@ def broadcast_apply_full_axis( pandas.Series(dtypes, index=new_columns) if is_list_like(dtypes) else pandas.Series( + # CHECKED: backend may be changed depending on function [self.construct_dtype(dtypes, self._pandas_backend)] * len(new_columns), index=new_columns, @@ -3652,6 +3674,7 @@ def broadcast_apply_full_axis( index=new_index, columns=new_columns, **kw, + # CHECKED: backend may be changed depending on function pandas_backend=self._pandas_backend, ) if sync_labels and new_index is not None: @@ -3832,12 +3855,12 @@ def _copartition( def n_ary_op( self, op, - right_frames: list, + right_frames: list[PandasDataframe], join_type="outer", copartition_along_columns=True, labels="replace", dtypes: Optional[pandas.Series] = None, - ): + ) -> PandasDataframe: """ Perform an n-opary operation by joining with other Modin DataFrame(s). @@ -3874,6 +3897,7 @@ def n_ary_op( self.copy_columns_cache(copy_lengths=True), row_lengths, self._column_widths_cache, + # CHECKED: backend preserved pandas_backend=self._pandas_backend, ) new_right_frames = [ @@ -3883,6 +3907,7 @@ def n_ary_op( right_frame.copy_columns_cache(copy_lengths=True), row_lengths, right_frame._column_widths_cache, + # CHECKED: backend preserved pandas_backend=self._pandas_backend, ) for right_parts, right_frame in zip(list_of_right_parts, right_frames) @@ -3921,6 +3946,7 @@ def n_ary_op( row_lengths, column_widths, dtypes, + # CHECKED: backend may be changed depending on function pandas_backend=self._pandas_backend, ) @@ -4055,6 +4081,7 @@ def _compute_new_widths(): new_lengths, new_widths, new_dtypes, + # CHECKED: backend preserved pandas_backend=self._pandas_backend, ) @@ -4130,6 +4157,7 @@ def _apply_func_to_range_partitioning_broadcast( index=new_index, columns=new_columns, dtypes=new_dtypes, + # CHECKED: backend may be changed depending on function pandas_backend=self._pandas_backend, ) @@ -4479,6 +4507,7 @@ def join_cols(df, *cols): new_partitions, index=result.copy_index_cache(), row_lengths=result._row_lengths_cache, + # CHECKED: backend may be changed depending on function pandas_backend=self._pandas_backend, ) @@ -4559,6 +4588,7 @@ def groupby_reduce( new_partitions, index=new_index, columns=new_columns, + # CHECKED: backend may be changed depending on function pandas_backend=self._pandas_backend, ) @@ -4744,6 +4774,7 @@ def transpose(self): self._column_widths_cache, self._row_lengths_cache, dtypes=new_dtypes, + # TODO: backend preserved? 
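+            # (Speculative note on the question above: transpose folds the
+            # original per-column dtypes into every result column, so a
+            # pyarrow-only tag may no longer describe the transposed data.)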
pandas_backend=self._pandas_backend, ) @@ -4932,6 +4963,7 @@ def remote_fn(df, name, caselist): # pragma: no cover columns, row_lengths, column_widths, + # CHECKED: backend may be changed depending on function pandas_backend=self._pandas_backend, ) for part in list_of_right_parts @@ -5004,5 +5036,6 @@ def map_data( index=self.index, row_lengths=lengths, column_widths=[1], + # CHECKED: backend may be changed depending on function pandas_backend=self._pandas_backend, ) From 8b93500af4ec2863c6777b9a1d7098eb70aa2faf Mon Sep 17 00:00:00 2001 From: Anatoly Myachev Date: Sun, 12 May 2024 23:12:36 +0200 Subject: [PATCH 30/50] fixes after merge Signed-off-by: Anatoly Myachev --- modin/core/storage_formats/base/query_compiler.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/modin/core/storage_formats/base/query_compiler.py b/modin/core/storage_formats/base/query_compiler.py index 2add02740d5..a84662c31f4 100644 --- a/modin/core/storage_formats/base/query_compiler.py +++ b/modin/core/storage_formats/base/query_compiler.py @@ -22,7 +22,7 @@ import abc import warnings from functools import cached_property -from typing import TYPE_CHECKING, Callable, Hashable, List, Optional +from typing import TYPE_CHECKING, Hashable, List, Optional import numpy as np import pandas From ae861e3aa6f092cc154c536bb1875e21e4237741 Mon Sep 17 00:00:00 2001 From: Anatoly Myachev Date: Mon, 13 May 2024 01:02:21 +0200 Subject: [PATCH 31/50] new approach Signed-off-by: Anatoly Myachev --- modin/core/dataframe/algebra/binary.py | 19 +---- modin/core/dataframe/algebra/fold.py | 13 +--- modin/core/dataframe/algebra/map.py | 9 +-- modin/core/dataframe/algebra/tree_reduce.py | 2 +- .../dataframe/pandas/dataframe/dataframe.py | 72 ++++++------------- .../pandas/partitioning/partition_manager.py | 5 +- .../storage_formats/base/query_compiler.py | 5 +- modin/pandas/utils.py | 9 +++ 8 files changed, 40 insertions(+), 94 deletions(-) diff --git a/modin/core/dataframe/algebra/binary.py b/modin/core/dataframe/algebra/binary.py index acc84460b0e..b55df138bfd 100644 --- a/modin/core/dataframe/algebra/binary.py +++ b/modin/core/dataframe/algebra/binary.py @@ -85,7 +85,6 @@ def maybe_compute_dtypes_common_cast( # belong to the intersection, these will be NaN columns in the result mismatch_columns = columns_first ^ columns_second elif isinstance(second, dict): - # TODO: pyarrow backend dtypes_second = { key: pandas.api.types.pandas_dtype(type(value)) for key, value in second.items() @@ -98,7 +97,6 @@ def maybe_compute_dtypes_common_cast( mismatch_columns = columns_first.difference(columns_second) else: if isinstance(second, (list, tuple)): - # TODO: pyarrow backend second_dtypes_list = ( [pandas.api.types.pandas_dtype(type(value)) for value in second] if axis == 1 @@ -107,7 +105,6 @@ def maybe_compute_dtypes_common_cast( else [np.array(second).dtype] * len(dtypes_first) ) elif is_scalar(second) or isinstance(second, np.ndarray): - # TODO: pyarrow backend try: dtype = getattr(second, "dtype", None) or pandas.api.types.pandas_dtype( type(second) @@ -133,7 +130,6 @@ def maybe_compute_dtypes_common_cast( mismatch_columns = [] # If at least one column doesn't match, the result of the non matching column would be nan. - # TODO: pyarrow backend nan_dtype = pandas.api.types.pandas_dtype(type(np.nan)) dtypes = None if func is not None: @@ -249,7 +245,7 @@ def try_compute_new_dtypes( infer_dtypes : {"common_cast", "try_sample", "bool", None}, default: None How dtypes should be infered (see ``Binary.register`` doc for more info). 
result_dtype : np.dtype, optional - NumPy dtype of the result. If not specified it will be inferred from the `infer_dtypes` parameter. Only NumPy? + NumPy dtype of the result. If not specified it will be inferred from the `infer_dtypes` parameter. axis : int, default: 0 Axis to perform the binary operation along. func : callable(pandas.DataFrame, pandas.DataFrame) -> pandas.DataFrame, optional @@ -264,19 +260,8 @@ def try_compute_new_dtypes( try: if infer_dtypes == "bool" or is_bool_dtype(result_dtype): - # dataframe can contain types of different backends at the same time, for example: - # (Pdb) (pandas.DataFrame([[1,2,3], [4,5,6]]).astype({0: "int64[pyarrow]"}) > 4).dtypes - # 0 bool[pyarrow] - # 1 bool - # 2 bool - # dtype: object - backend = "" - if any("pyarrow" in str(x) for x in first.dtypes) or any( - "pyarrow" in str(x) for x in second.dtypes - ): - backend = "[pyarrow]" dtypes = maybe_build_dtypes_series( - first, second, dtype=pandas.api.types.pandas_dtype(f"bool{backend}") + first, second, dtype=pandas.api.types.pandas_dtype(bool) ) elif infer_dtypes == "common_cast": dtypes = maybe_compute_dtypes_common_cast( diff --git a/modin/core/dataframe/algebra/fold.py b/modin/core/dataframe/algebra/fold.py index 9f6673a3e0a..419a0b56903 100644 --- a/modin/core/dataframe/algebra/fold.py +++ b/modin/core/dataframe/algebra/fold.py @@ -13,21 +13,14 @@ """Module houses builder class for Fold operator.""" -from __future__ import annotations - -from typing import TYPE_CHECKING, Callable - from .operator import Operator -if TYPE_CHECKING: - from modin.core.storage_formats.pandas.query_compiler import PandasQueryCompiler - class Fold(Operator): """Builder class for Fold functions.""" @classmethod - def register(cls, fold_function) -> Callable[..., PandasQueryCompiler]: + def register(cls, fold_function): """ Build Fold operator that will be performed across rows/columns. @@ -42,9 +35,7 @@ def register(cls, fold_function) -> Callable[..., PandasQueryCompiler]: Function that takes query compiler and executes Fold function. """ - def caller( - query_compiler: PandasQueryCompiler, fold_axis=None, *args, **kwargs - ) -> PandasQueryCompiler: + def caller(query_compiler, fold_axis=None, *args, **kwargs): """ Execute Fold function against passed query compiler. diff --git a/modin/core/dataframe/algebra/map.py b/modin/core/dataframe/algebra/map.py index aefebe6c017..57b21f6e1b0 100644 --- a/modin/core/dataframe/algebra/map.py +++ b/modin/core/dataframe/algebra/map.py @@ -13,15 +13,8 @@ """Module houses builder class for Map operator.""" -from __future__ import annotations - -from typing import TYPE_CHECKING - from .operator import Operator -if TYPE_CHECKING: - from modin.core.storage_formats.pandas.query_compiler import PandasQueryCompiler - class Map(Operator): """Builder class for Map operator.""" @@ -48,7 +41,7 @@ def register(cls, function, *call_args, **call_kwds): Function that takes query compiler and executes map function. 
""" - def caller(query_compiler: PandasQueryCompiler, *args, **kwargs): + def caller(query_compiler, *args, **kwargs): """Execute Map function against passed query compiler.""" shape_hint = call_kwds.pop("shape_hint", None) or query_compiler._shape_hint return query_compiler.__constructor__( diff --git a/modin/core/dataframe/algebra/tree_reduce.py b/modin/core/dataframe/algebra/tree_reduce.py index 8a30196cbeb..fa7b731e6f5 100644 --- a/modin/core/dataframe/algebra/tree_reduce.py +++ b/modin/core/dataframe/algebra/tree_reduce.py @@ -35,7 +35,7 @@ def register( axis : int, optional Specifies axis to apply function along. compute_dtypes : callable(pandas.Series, *func_args, **func_kwargs) -> np.dtype, optional - Callable for computing dtypes. Only NumPy? + Callable for computing dtypes. Returns ------- diff --git a/modin/core/dataframe/pandas/dataframe/dataframe.py b/modin/core/dataframe/pandas/dataframe/dataframe.py index fd882539c47..8059f1c988d 100644 --- a/modin/core/dataframe/pandas/dataframe/dataframe.py +++ b/modin/core/dataframe/pandas/dataframe/dataframe.py @@ -65,7 +65,11 @@ from modin.logging import ClassLogger from modin.logging.config import LogLevel from modin.pandas.indexing import is_range_like -from modin.pandas.utils import check_both_not_none, is_full_grab_slice +from modin.pandas.utils import ( + check_both_not_none, + get_pandas_backend, + is_full_grab_slice, +) from modin.utils import MODIN_UNNAMED_SERIES_LABEL @@ -136,8 +140,15 @@ def __init__( self.set_columns_cache(columns) self._row_lengths_cache = row_lengths self._column_widths_cache = column_widths - self.set_dtypes_cache(dtypes) self._pandas_backend = pandas_backend + if not pandas_backend == "pyarrow": + # In this case, the type precomputation may be incorrect; we need + # to know the type algebra precisely. Considering the number of operations + # and different combinations of backends, the best solution would be to + # introduce optimizations gradually, with a large number of tests. + self.set_dtypes_cache(dtypes) + else: + self.set_dtypes_cache(None) self._validate_axes_lengths() self._filter_empties(compute_metadata=False) @@ -406,6 +417,9 @@ def dtypes(self): else: dtypes = self._compute_dtypes() self.set_dtypes_cache(dtypes) + # During materialization, we can find out the backend and, if it + # is suitable, use the ability to pre-calculate types. 
+ self._pandas_backend = get_pandas_backend(dtypes) return dtypes def get_dtypes_set(self): @@ -1321,7 +1335,6 @@ def _take_2d_positional( new_row_lengths, new_col_widths, new_dtypes, - # CHECKED: backend preserved pandas_backend=self._pandas_backend, ) @@ -1499,7 +1512,6 @@ def from_labels_executor( row_lengths=self._row_lengths_cache, column_widths=new_column_widths, dtypes=new_dtypes, - # CHECKED: backend preserved pandas_backend=self._pandas_backend, ) # Set flag for propagating deferred row labels across dataframe partitions @@ -1633,7 +1645,6 @@ def _reorder_labels(self, row_positions=None, col_positions=None): new_lengths, new_widths, new_dtypes, - # CHECKED: backend preserved pandas_backend=self._pandas_backend, ) @@ -1654,7 +1665,6 @@ def copy(self): self._row_lengths_cache, self._column_widths_cache, self.copy_dtypes_cache(), - # CHECKED: backend preserved pandas_backend=self._pandas_backend, ) @@ -1688,7 +1698,6 @@ def astype(self, col_dtypes, errors: str = "raise"): if new_dtypes is None: new_dtypes = self_dtypes.copy() # Update the new dtype series to the proper pandas dtype - # We don't need to add an implicit backend for `astype` new_dtype = pandas.api.types.pandas_dtype(dtype) if Engine.get() == "Dask" and hasattr(dtype, "_is_materialized"): # FIXME: https://github.com/dask/distributed/issues/8585 @@ -1718,7 +1727,6 @@ def astype_builder(df): # Assume that the dtype is a scalar. if not (col_dtypes == self_dtypes).all(): new_dtypes = self_dtypes.copy() - # We don't need to add an implicit backend for `astype` new_dtype = pandas.api.types.pandas_dtype(col_dtypes) if Engine.get() == "Dask" and hasattr(new_dtype, "_is_materialized"): # FIXME: https://github.com/dask/distributed/issues/8585 @@ -1752,7 +1760,7 @@ def astype_builder(df): new_frame = self._partition_mgr_cls.lazy_map_partitions( self._partitions, astype_builder ) - # TODO: recompute _pandas_backend (it can be changed) + return self.__constructor__( new_frame, self.copy_index_cache(copy_lengths=True), @@ -1760,8 +1768,7 @@ def astype_builder(df): self._row_lengths_cache, self._column_widths_cache, new_dtypes, - # TODO: backend can be changed - pandas_backend=self._pandas_backend, + pandas_backend=get_pandas_backend(new_dtypes), ) def numeric_columns(self, include_bool=True): @@ -2113,7 +2120,7 @@ def _compute_tree_reduce_metadata(self, axis, new_parts, dtypes=None): dtypes = self.copy_dtypes_cache() elif dtypes is not None: dtypes = pandas.Series( - [self.construct_dtype(dtypes, self._pandas_backend)] * len(new_axes[1]), + [pandas.api.types.pandas_dtype(dtypes)] * len(new_axes[1]), index=new_axes[1], ) @@ -2122,7 +2129,6 @@ def _compute_tree_reduce_metadata(self, axis, new_parts, dtypes=None): *new_axes, *new_axes_lengths, dtypes, - # CHECKED: backend may be changed depending on function pandas_backend=self._pandas_backend, ) return result @@ -2299,7 +2305,7 @@ def map( # Materializing lazy columns in order to build dtype's index new_columns = new_columns.get(return_lengths=False) dtypes = pandas.Series( - [self.construct_dtype(dtypes, self._pandas_backend)] * len(new_columns), + [pandas.api.types.pandas_dtype(dtypes)] * len(new_columns), index=new_columns, ) return self.__constructor__( @@ -2309,7 +2315,6 @@ def map( self._row_lengths_cache, self._column_widths_cache, dtypes=dtypes, - # CHECKED: backend may be changed depending on function pandas_backend=self._pandas_backend, ) @@ -2390,7 +2395,6 @@ def fold(self, axis, func, new_columns=None): self.copy_columns_cache(copy_lengths=True), self._row_lengths_cache, 
self._column_widths_cache, - # CHECKED: backend may be changed depending on function pandas_backend=self._pandas_backend, ) @@ -2438,7 +2442,6 @@ def infer_types(self, col_labels: List[str]) -> PandasDataframe: self._row_lengths_cache, self._column_widths_cache, new_dtypes, - # CHECKED: backend may be changed depending on `new_cols_dtypes` pandas_backend=self._pandas_backend, ) @@ -2545,7 +2548,6 @@ def combine_and_apply( self._row_lengths_cache, [len(self.columns)] if self.has_materialized_columns else None, self.copy_dtypes_cache(), - # CHECKED: backend preserved pandas_backend=self._pandas_backend, ) else: @@ -2850,7 +2852,6 @@ def filter(self, axis: Union[Axis, int], condition: Callable) -> PandasDataframe *new_axes, *new_lengths, self.copy_dtypes_cache() if axis == Axis.COL_WISE else None, - # CHECKED: backend preserved pandas_backend=self._pandas_backend, ) @@ -2910,7 +2911,6 @@ def explode(self, axis: Union[int, Axis], func: Callable) -> PandasDataframe: new_columns, row_lengths, column_widths, - # TODO: need check pandas_backend=self._pandas_backend, ) @@ -2939,7 +2939,6 @@ def combine(self) -> PandasDataframe: else None ), dtypes=self.copy_dtypes_cache(), - # CHECKED: backend preserved pandas_backend=self._pandas_backend, ) result.synchronize_labels() @@ -3096,7 +3095,6 @@ def apply_full_axis_select_indices( None, None, dtypes=new_dtypes, - # CHECKED: backend may be changed depending on function pandas_backend=self._pandas_backend, ) @@ -3192,7 +3190,6 @@ def apply_select_indices( lengths_objs[0], lengths_objs[1], new_dtypes, - # CHECKED: backend may be changed depending on function pandas_backend=self._pandas_backend, ) else: @@ -3221,7 +3218,6 @@ def apply_select_indices( self._row_lengths_cache, self._column_widths_cache, new_dtypes, - # CHECKED: backend may be changed depending on function pandas_backend=self._pandas_backend, ) @@ -3328,7 +3324,6 @@ def _pick_axis(get_axis, sizes_cache): new_row_lengths, new_column_widths, dtypes=dtypes, - # CHECKED: backend may be changed depending on function pandas_backend=self._pandas_backend, ) @@ -3471,18 +3466,9 @@ def broadcast_apply_select_indices( new_partitions, index=new_index, columns=new_columns, - # CHECKED: backend may be changed depending on function pandas_backend=self._pandas_backend, ) - def construct_dtype(self, dtype: str, backend: Optional[str]): - if backend is None or dtype == "category": - return pandas.api.types.pandas_dtype(dtype) - elif backend == "pyarrow": - return pandas.api.types.pandas_dtype(f"{dtype}[{backend}]") - else: - raise NotImplementedError - @lazy_metadata_decorator(apply_axis="both") def broadcast_apply_full_axis( self, @@ -3596,17 +3582,14 @@ def broadcast_apply_full_axis( else: if new_columns is None: assert not is_list_like(dtypes) - # CHECKED: backend may be changed depending on function - dtype = self.construct_dtype(dtypes, self._pandas_backend) + dtype = pandas.api.types.pandas_dtype(dtypes) kw["dtypes"] = ModinDtypes(DtypesDescriptor(remaining_dtype=dtype)) else: kw["dtypes"] = ( pandas.Series(dtypes, index=new_columns) if is_list_like(dtypes) else pandas.Series( - # CHECKED: backend may be changed depending on function - [self.construct_dtype(dtypes, self._pandas_backend)] - * len(new_columns), + [pandas.api.types.pandas_dtype(dtypes)] * len(new_columns), index=new_columns, ) ) @@ -3674,7 +3657,6 @@ def broadcast_apply_full_axis( index=new_index, columns=new_columns, **kw, - # CHECKED: backend may be changed depending on function pandas_backend=self._pandas_backend, ) if sync_labels and 
new_index is not None: @@ -3897,7 +3879,6 @@ def n_ary_op( self.copy_columns_cache(copy_lengths=True), row_lengths, self._column_widths_cache, - # CHECKED: backend preserved pandas_backend=self._pandas_backend, ) new_right_frames = [ @@ -3907,7 +3888,6 @@ def n_ary_op( right_frame.copy_columns_cache(copy_lengths=True), row_lengths, right_frame._column_widths_cache, - # CHECKED: backend preserved pandas_backend=self._pandas_backend, ) for right_parts, right_frame in zip(list_of_right_parts, right_frames) @@ -3946,7 +3926,6 @@ def n_ary_op( row_lengths, column_widths, dtypes, - # CHECKED: backend may be changed depending on function pandas_backend=self._pandas_backend, ) @@ -4081,7 +4060,6 @@ def _compute_new_widths(): new_lengths, new_widths, new_dtypes, - # CHECKED: backend preserved pandas_backend=self._pandas_backend, ) @@ -4157,7 +4135,6 @@ def _apply_func_to_range_partitioning_broadcast( index=new_index, columns=new_columns, dtypes=new_dtypes, - # CHECKED: backend may be changed depending on function pandas_backend=self._pandas_backend, ) @@ -4507,7 +4484,6 @@ def join_cols(df, *cols): new_partitions, index=result.copy_index_cache(), row_lengths=result._row_lengths_cache, - # CHECKED: backend may be changed depending on function pandas_backend=self._pandas_backend, ) @@ -4588,7 +4564,6 @@ def groupby_reduce( new_partitions, index=new_index, columns=new_columns, - # CHECKED: backend may be changed depending on function pandas_backend=self._pandas_backend, ) @@ -4774,7 +4749,6 @@ def transpose(self): self._column_widths_cache, self._row_lengths_cache, dtypes=new_dtypes, - # TODO: backend preserved? pandas_backend=self._pandas_backend, ) @@ -4963,7 +4937,6 @@ def remote_fn(df, name, caselist): # pragma: no cover columns, row_lengths, column_widths, - # CHECKED: backend may be changed depending on function pandas_backend=self._pandas_backend, ) for part in list_of_right_parts @@ -5036,6 +5009,5 @@ def map_data( index=self.index, row_lengths=lengths, column_widths=[1], - # CHECKED: backend may be changed depending on function pandas_backend=self._pandas_backend, ) diff --git a/modin/core/dataframe/pandas/partitioning/partition_manager.py b/modin/core/dataframe/pandas/partitioning/partition_manager.py index 86b8ed5cd81..0f2e99cfb22 100644 --- a/modin/core/dataframe/pandas/partitioning/partition_manager.py +++ b/modin/core/dataframe/pandas/partitioning/partition_manager.py @@ -40,6 +40,7 @@ from modin.error_message import ErrorMessage from modin.logging import ClassLogger from modin.logging.config import LogLevel +from modin.pandas.utils import get_pandas_backend if TYPE_CHECKING: from modin.core.dataframe.pandas.dataframe.utils import ShuffleFunctions @@ -985,9 +986,7 @@ def update_bar(f): parts = cls.split_pandas_df_into_partitions( df, row_chunksize, col_chunksize, update_bar ) - backend = None - if any(isinstance(x, pandas.ArrowDtype) for x in df.dtypes): - backend = "pyarrow" + backend = get_pandas_backend(df.dtypes) if ProgressBar.get(): pbar.close() if not return_dims: diff --git a/modin/core/storage_formats/base/query_compiler.py b/modin/core/storage_formats/base/query_compiler.py index a84662c31f4..31e3feb7bdc 100644 --- a/modin/core/storage_formats/base/query_compiler.py +++ b/modin/core/storage_formats/base/query_compiler.py @@ -6754,10 +6754,7 @@ def case_when(self, caselist): # noqa: PR01, RT01, D200 ] return SeriesDefault.register(pandas.Series.case_when)(self, caselist=caselist) - def construct_dtype(self, dtype: str, backend: Optional[str]): - return 
self._modin_frame.construct_dtype(dtype, backend) - - def get_backend(self) -> str: + def get_backend(self) -> Optional[str]: return self._modin_frame._pandas_backend def repartition(self, axis=None): diff --git a/modin/pandas/utils.py b/modin/pandas/utils.py index 3a9ad6aa13d..6039baa2c7b 100644 --- a/modin/pandas/utils.py +++ b/modin/pandas/utils.py @@ -13,6 +13,8 @@ """Implement utils for pandas component.""" +from __future__ import annotations + from typing import Iterator, Optional, Tuple import numpy as np @@ -116,6 +118,13 @@ def is_scalar(obj): return not isinstance(obj, BasePandasDataset) and pandas_is_scalar(obj) +def get_pandas_backend(dtypes: pandas.Series) -> str | None: + backend = None + if any(isinstance(x, pandas.ArrowDtype) for x in dtypes): + backend = "pyarrow" + return backend + + def is_full_grab_slice(slc, sequence_len=None): """ Check that the passed slice grabs the whole sequence. From 5b18cfdce766a89bebf87b0fe26cea749c08cf39 Mon Sep 17 00:00:00 2001 From: Anatoly Myachev Date: Mon, 13 May 2024 01:23:09 +0200 Subject: [PATCH 32/50] cleanup Signed-off-by: Anatoly Myachev --- .../core/storage_formats/base/query_compiler.py | 1 - .../core/storage_formats/pandas/aggregations.py | 2 -- .../storage_formats/pandas/query_compiler.py | 1 - .../distributed/dataframe/pandas/partitions.py | 4 ++-- modin/logging/logger_decorator.py | 17 ++++------------- modin/numpy/indexing.py | 4 ++-- modin/pandas/base.py | 12 ++++++------ modin/pandas/dataframe.py | 3 --- modin/pandas/indexing.py | 4 ++-- 9 files changed, 16 insertions(+), 32 deletions(-) diff --git a/modin/core/storage_formats/base/query_compiler.py b/modin/core/storage_formats/base/query_compiler.py index 31e3feb7bdc..1d6cc719d17 100644 --- a/modin/core/storage_formats/base/query_compiler.py +++ b/modin/core/storage_formats/base/query_compiler.py @@ -4288,7 +4288,6 @@ def get_positions_from_labels(self, row_loc, col_loc): # `Index.get_indexer_for` works much faster with numpy arrays than with python lists, # so although we lose some time here on converting to numpy, `Index.get_indexer_for` # speedup covers the loss that we gain here. - # TODO: pyarrow backend? axis_loc = np.array(axis_loc, dtype=axis_labels.dtype) axis_lookup = axis_labels.get_indexer_for(axis_loc) # `Index.get_indexer_for` sets -1 value for missing labels, we have to verify whether diff --git a/modin/core/storage_formats/pandas/aggregations.py b/modin/core/storage_formats/pandas/aggregations.py index 6c2e795a523..fd7d84f49d8 100644 --- a/modin/core/storage_formats/pandas/aggregations.py +++ b/modin/core/storage_formats/pandas/aggregations.py @@ -67,7 +67,6 @@ def corr_method( qc._modin_frame.copy_columns_cache(), ) new_dtypes = pandas.Series( - # TODO: pyarrow backend? np.repeat(pandas.api.types.pandas_dtype("float"), len(new_columns)), index=new_columns, ) @@ -77,7 +76,6 @@ def corr_method( new_columns = old_dtypes[old_dtypes.map(is_numeric_dtype)].index new_index = new_columns.copy() new_dtypes = pandas.Series( - # TODO: pyarrow backend? np.repeat(pandas.api.types.pandas_dtype("float"), len(new_columns)), index=new_columns, ) diff --git a/modin/core/storage_formats/pandas/query_compiler.py b/modin/core/storage_formats/pandas/query_compiler.py index 33e05a6c491..3df16fee11b 100644 --- a/modin/core/storage_formats/pandas/query_compiler.py +++ b/modin/core/storage_formats/pandas/query_compiler.py @@ -3088,7 +3088,6 @@ def mapper(df: pandas.DataFrame): ) # we have to keep other columns so setting their mask # values with `False` - # TODO: pyarrow backend? 
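+            # (a NumPy bool mask stays valid here regardless of the column
+            # backend, since it only marks column positions to keep; hedged
+            # rationale for dropping the TODO above)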
mask = pandas.Series( np.zeros(df.shape[1], dtype=bool), index=df.columns ) diff --git a/modin/distributed/dataframe/pandas/partitions.py b/modin/distributed/dataframe/pandas/partitions.py index 62a05ff81d3..cac3bec93b6 100644 --- a/modin/distributed/dataframe/pandas/partitions.py +++ b/modin/distributed/dataframe/pandas/partitions.py @@ -90,7 +90,7 @@ def unwrap_partitions( f"Only API Layer objects may be passed in here, got {type(api_layer_object)} instead." ) - modin_frame = api_layer_object._query_compiler._modin_frame + modin_frame = api_layer_object._query_compiler._modin_frame # type: ignore[attr-defined] modin_frame._propagate_index_objs(None) if axis is None: @@ -122,7 +122,7 @@ def get_block(partition: PartitionUnionType) -> np.ndarray: ] actual_engine = type( - api_layer_object._query_compiler._modin_frame._partitions[0][0] + api_layer_object._query_compiler._modin_frame._partitions[0][0] # type: ignore[attr-defined] ).__name__ if actual_engine in ( "PandasOnRayDataframePartition", diff --git a/modin/logging/logger_decorator.py b/modin/logging/logger_decorator.py index 662f7d1de73..301fb02562b 100644 --- a/modin/logging/logger_decorator.py +++ b/modin/logging/logger_decorator.py @@ -19,7 +19,7 @@ from functools import wraps from types import FunctionType, MethodType -from typing import Any, Callable, Dict, Optional, Tuple, Type, TypeVar, Union, overload +from typing import Any, Callable, Dict, Optional, Tuple, Type, Union from modin.config import LogMode @@ -28,9 +28,6 @@ _MODIN_LOGGER_NOWRAP = "__modin_logging_nowrap__" -Fn = TypeVar("Fn", bound=Callable) - - def disable_logging(func: Callable) -> Callable: """ Disable logging of one particular function. Useful for decorated classes. @@ -49,17 +46,11 @@ def disable_logging(func: Callable) -> Callable: return func -@overload -def enable_logging(modin_layer: Fn) -> Fn: - # This helps preserve typings when the decorator is used without parentheses - ... - - def enable_logging( - modin_layer: Union[str, Fn, Type] = "PANDAS-API", + modin_layer: Union[str, Callable, Type] = "PANDAS-API", name: Optional[str] = None, log_level: LogLevel = LogLevel.INFO, -) -> Callable[[Fn], Fn]: +) -> Callable: """ Log Decorator used on specific Modin functions or classes. @@ -85,7 +76,7 @@ def enable_logging( # def func() return enable_logging()(modin_layer) - def decorator(obj: Fn) -> Fn: + def decorator(obj: Any) -> Any: """Decorate function or class to add logs to Modin API function(s).""" if isinstance(obj, type): seen: Dict[Any, Any] = {} diff --git a/modin/numpy/indexing.py b/modin/numpy/indexing.py index 4223ae3e513..b598577a34d 100644 --- a/modin/numpy/indexing.py +++ b/modin/numpy/indexing.py @@ -214,7 +214,7 @@ def boolean_mask_to_numeric(indexer): # `itertools.compress` masks `data` with the `selectors` mask, # works about ~10% faster than a pure list comprehension itertools.compress(data=range(len(indexer)), selectors=indexer), - dtype="int64", + dtype=np.int64, ) @@ -585,7 +585,7 @@ def _compute_lookup(self, row_loc, col_loc): # `Index.__getitem__` works much faster with numpy arrays than with python lists, # so although we lose some time here on converting to numpy, `Index.__getitem__` # speedup covers the loss that we gain here. - axis_loc = np.array(axis_loc, dtype="int64") + axis_loc = np.array(axis_loc, dtype=np.int64) # Relatively fast check allows us to not trigger `self.qc.get_axis()` computation # if there're no negative indices and so they don't not depend on the axis length. 
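# A standalone sketch of the fast check performed by the `if` just below,
# assuming only numpy: with the positions already in an int64 array, "is any
# index negative?" is a single vectorized pass, so the potentially expensive
# axis length is computed only when a negative position must be wrapped.
import numpy as np

axis_loc = np.array([0, 2, 5], dtype=np.int64)
if not (axis_loc < 0).any():
    print("no negative positions; axis length not needed")
else:
    axis_len = 10  # hypothetical axis length, fetched only on this branch
    print(axis_loc % axis_len)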
if isinstance(axis_loc, np.ndarray) and not (axis_loc < 0).any(): diff --git a/modin/pandas/base.py b/modin/pandas/base.py index 322af7debbe..656644f426e 100644 --- a/modin/pandas/base.py +++ b/modin/pandas/base.py @@ -1534,7 +1534,7 @@ def eq(self, other, axis="columns", level=None): # noqa: PR01, RT01, D200 """ Get equality of `BasePandasDataset` and `other`, element-wise (binary operator `eq`). """ - return self._binary_op("eq", other, axis=axis, level=level, dtypes="bool") + return self._binary_op("eq", other, axis=axis, level=level, dtypes=np.bool_) def explode(self, column, ignore_index: bool = False): # noqa: PR01, RT01, D200 """ @@ -1835,7 +1835,7 @@ def ge(self, other, axis="columns", level=None): # noqa: PR01, RT01, D200 """ Get greater than or equal comparison of `BasePandasDataset` and `other`, element-wise (binary operator `ge`). """ - return self._binary_op("ge", other, axis=axis, level=level, dtypes="bool") + return self._binary_op("ge", other, axis=axis, level=level, dtypes=np.bool_) def get(self, key, default=None): # noqa: PR01, RT01, D200 """ @@ -1851,7 +1851,7 @@ def gt(self, other, axis="columns", level=None): # noqa: PR01, RT01, D200 """ Get greater than comparison of `BasePandasDataset` and `other`, element-wise (binary operator `gt`). """ - return self._binary_op("gt", other, axis=axis, level=level, dtypes="bool") + return self._binary_op("gt", other, axis=axis, level=level, dtypes=np.bool_) def head(self, n=5): # noqa: PR01, RT01, D200 """ @@ -1983,13 +1983,13 @@ def le(self, other, axis="columns", level=None): # noqa: PR01, RT01, D200 """ Get less than or equal comparison of `BasePandasDataset` and `other`, element-wise (binary operator `le`). """ - return self._binary_op("le", other, axis=axis, level=level, dtypes="bool") + return self._binary_op("le", other, axis=axis, level=level, dtypes=np.bool_) def lt(self, other, axis="columns", level=None): # noqa: PR01, RT01, D200 """ Get less than comparison of `BasePandasDataset` and `other`, element-wise (binary operator `lt`). """ - return self._binary_op("lt", other, axis=axis, level=level, dtypes="bool") + return self._binary_op("lt", other, axis=axis, level=level, dtypes=np.bool_) @property def loc(self): # noqa: RT01, D200 @@ -2198,7 +2198,7 @@ def ne(self, other, axis="columns", level=None): # noqa: PR01, RT01, D200 """ Get Not equal comparison of `BasePandasDataset` and `other`, element-wise (binary operator `ne`). """ - return self._binary_op("ne", other, axis=axis, level=level, dtypes="bool") + return self._binary_op("ne", other, axis=axis, level=level, dtypes=np.bool_) def notna(self): # noqa: RT01, D200 """ diff --git a/modin/pandas/dataframe.py b/modin/pandas/dataframe.py index b5ef0b1643d..53c68494249 100644 --- a/modin/pandas/dataframe.py +++ b/modin/pandas/dataframe.py @@ -3067,7 +3067,6 @@ def _validate_dtypes_min_max(self, axis, numeric_only) -> DataFrame: ): # check if there are columns with dtypes datetime or timedelta if all( - # TODO: pyarrow backend? dtype != pandas.api.types.pandas_dtype("datetime64[ns]") and dtype != pandas.api.types.pandas_dtype("timedelta64[ns]") for dtype in self.dtypes @@ -3104,7 +3103,6 @@ def _validate_dtypes_sum_prod_mean( not axis and numeric_only is False and any( - # TODO: pyarrow backend? dtype == pandas.api.types.pandas_dtype("datetime64[ns]") for dtype in self.dtypes ) @@ -3125,7 +3123,6 @@ def _validate_dtypes_sum_prod_mean( ): # check if there are columns with dtypes datetime or timedelta if all( - # TODO: pyarrow backend? 
dtype != pandas.api.types.pandas_dtype("datetime64[ns]") and dtype != pandas.api.types.pandas_dtype("timedelta64[ns]") for dtype in self.dtypes diff --git a/modin/pandas/indexing.py b/modin/pandas/indexing.py index 316a75f82a7..d901b6dac99 100644 --- a/modin/pandas/indexing.py +++ b/modin/pandas/indexing.py @@ -232,7 +232,7 @@ def boolean_mask_to_numeric(indexer): # `itertools.compress` masks `data` with the `selectors` mask, # works about ~10% faster than a pure list comprehension itertools.compress(data=range(len(indexer)), selectors=indexer), - dtype="int64", + dtype=np.int64, ) @@ -1130,7 +1130,7 @@ def _compute_lookup(self, row_loc, col_loc): # `Index.__getitem__` works much faster with numpy arrays than with python lists, # so although we lose some time here on converting to numpy, `Index.__getitem__` # speedup covers the loss that we gain here. - axis_loc = np.array(axis_loc, dtype="int64") + axis_loc = np.array(axis_loc, dtype=np.int64) # Relatively fast check allows us to not trigger `self.qc.get_axis()` computation # if there're no negative indices and so they don't not depend on the axis length. if isinstance(axis_loc, np.ndarray) and not (axis_loc < 0).any(): From 9c6ce78c4990f87a04fc64449f5e1e5eb71125ce Mon Sep 17 00:00:00 2001 From: Anatoly Myachev Date: Mon, 13 May 2024 01:37:31 +0200 Subject: [PATCH 33/50] cleanup Signed-off-by: Anatoly Myachev --- .../storage_formats/pandas/query_compiler.py | 166 +++++++++--------- 1 file changed, 84 insertions(+), 82 deletions(-) diff --git a/modin/core/storage_formats/pandas/query_compiler.py b/modin/core/storage_formats/pandas/query_compiler.py index 3df16fee11b..4ad25f5ace5 100644 --- a/modin/core/storage_formats/pandas/query_compiler.py +++ b/modin/core/storage_formats/pandas/query_compiler.py @@ -427,22 +427,22 @@ def to_numpy(self, **kwargs): combine_first = Binary.register( pandas.DataFrame.combine_first, infer_dtypes="common_cast" ) - eq = Binary.register(pandas.DataFrame.eq, infer_dtypes="bool") + eq = Binary.register(pandas.DataFrame.eq, infer_dtypes=np.bool_) equals = Binary.register( lambda df, other: pandas.DataFrame([[df.equals(other)]]), join_type=None, labels="drop", - infer_dtypes="bool", + infer_dtypes=np.bool_, ) floordiv = Binary.register(pandas.DataFrame.floordiv, infer_dtypes="try_sample") - ge = Binary.register(pandas.DataFrame.ge, infer_dtypes="bool") - gt = Binary.register(pandas.DataFrame.gt, infer_dtypes="bool") - le = Binary.register(pandas.DataFrame.le, infer_dtypes="bool") - lt = Binary.register(pandas.DataFrame.lt, infer_dtypes="bool") + ge = Binary.register(pandas.DataFrame.ge, infer_dtypes=np.bool_) + gt = Binary.register(pandas.DataFrame.gt, infer_dtypes=np.bool_) + le = Binary.register(pandas.DataFrame.le, infer_dtypes=np.bool_) + lt = Binary.register(pandas.DataFrame.lt, infer_dtypes=np.bool_) mod = Binary.register(pandas.DataFrame.mod, infer_dtypes="try_sample") mul = Binary.register(pandas.DataFrame.mul, infer_dtypes="try_sample") rmul = Binary.register(pandas.DataFrame.rmul, infer_dtypes="try_sample") - ne = Binary.register(pandas.DataFrame.ne, infer_dtypes="bool") + ne = Binary.register(pandas.DataFrame.ne, infer_dtypes=np.bool_) pow = Binary.register(pandas.DataFrame.pow, infer_dtypes="try_sample") radd = Binary.register(pandas.DataFrame.radd, infer_dtypes="try_sample") rfloordiv = Binary.register(pandas.DataFrame.rfloordiv, infer_dtypes="try_sample") @@ -452,12 +452,12 @@ def to_numpy(self, **kwargs): rtruediv = Binary.register(pandas.DataFrame.rtruediv, infer_dtypes="try_sample") sub = 
Binary.register(pandas.DataFrame.sub, infer_dtypes="try_sample") truediv = Binary.register(pandas.DataFrame.truediv, infer_dtypes="try_sample") - __and__ = Binary.register(pandas.DataFrame.__and__, infer_dtypes="bool") - __or__ = Binary.register(pandas.DataFrame.__or__, infer_dtypes="bool") - __rand__ = Binary.register(pandas.DataFrame.__rand__, infer_dtypes="bool") - __ror__ = Binary.register(pandas.DataFrame.__ror__, infer_dtypes="bool") - __rxor__ = Binary.register(pandas.DataFrame.__rxor__, infer_dtypes="bool") - __xor__ = Binary.register(pandas.DataFrame.__xor__, infer_dtypes="bool") + __and__ = Binary.register(pandas.DataFrame.__and__, infer_dtypes=np.bool_) + __or__ = Binary.register(pandas.DataFrame.__or__, infer_dtypes=np.bool_) + __rand__ = Binary.register(pandas.DataFrame.__rand__, infer_dtypes=np.bool_) + __ror__ = Binary.register(pandas.DataFrame.__ror__, infer_dtypes=np.bool_) + __rxor__ = Binary.register(pandas.DataFrame.__rxor__, infer_dtypes=np.bool_) + __xor__ = Binary.register(pandas.DataFrame.__xor__, infer_dtypes=np.bool_) df_update = Binary.register( copy_df_for_func(pandas.DataFrame.update, display_name="update"), join_type="left", @@ -475,19 +475,19 @@ def to_numpy(self, **kwargs): lambda df, other, *args, **kwargs: pandas.DataFrame( np.logical_and(df, other, *args, **kwargs) ), - infer_dtypes="bool", + infer_dtypes=np.bool_, ) _logical_or = Binary.register( lambda df, other, *args, **kwargs: pandas.DataFrame( np.logical_or(df, other, *args, **kwargs) ), - infer_dtypes="bool", + infer_dtypes=np.bool_, ) _logical_xor = Binary.register( lambda df, other, *args, **kwargs: pandas.DataFrame( np.logical_xor(df, other, *args, **kwargs) ), - infer_dtypes="bool", + infer_dtypes=np.bool_, ) def where(self, cond, other, **kwargs): @@ -943,7 +943,7 @@ def compute_dtypes_fn(dtypes, axis, **kwargs): and any(is_bool_dtype(t) for t in dtypes) and any(is_numeric_dtype(t) for t in dtypes) ): - return "object" + return np.object_ # how to take into account backend here? return "float64" @@ -1850,7 +1850,7 @@ def isin_func(df, values): ) return res - return Map.register(isin_func, shape_hint=shape_hint, dtypes="bool")( + return Map.register(isin_func, shape_hint=shape_hint, dtypes=np.bool_)( self, values ) @@ -1882,37 +1882,37 @@ def convert_dtypes( return result invert = Map.register(pandas.DataFrame.__invert__, dtypes="copy") - isna = Map.register(pandas.DataFrame.isna, dtypes="bool") + isna = Map.register(pandas.DataFrame.isna, dtypes=np.bool_) # better way to distinguish methods for NumPy API? 
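# The numpy-API helpers registered below all share one shape: wrap a numpy
# ufunc so it takes and returns pandas objects with a known elementwise result
# dtype. A minimal standalone sketch of that wrapper, assuming plain
# numpy-backed data:
import numpy as np
import pandas

def wrapped_isfinite(df, *args, **kwargs):
    return pandas.DataFrame(np.isfinite(df, *args, **kwargs))

print(wrapped_isfinite(pandas.DataFrame({"a": [1.0, np.inf, np.nan]})))
#        a
# 0   True
# 1  False
# 2  False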
_isfinite = Map.register( lambda df, *args, **kwargs: pandas.DataFrame(np.isfinite(df, *args, **kwargs)), - dtypes="bool", + dtypes=np.bool_, ) _isinf = Map.register( # Needed for numpy API lambda df, *args, **kwargs: pandas.DataFrame(np.isinf(df, *args, **kwargs)), - dtypes="bool", + dtypes=np.bool_, ) _isnat = Map.register( # Needed for numpy API lambda df, *args, **kwargs: pandas.DataFrame(np.isnat(df, *args, **kwargs)), - dtypes="bool", + dtypes=np.bool_, ) _isneginf = Map.register( # Needed for numpy API lambda df, *args, **kwargs: pandas.DataFrame(np.isneginf(df, *args, **kwargs)), - dtypes="bool", + dtypes=np.bool_, ) _isposinf = Map.register( # Needed for numpy API lambda df, *args, **kwargs: pandas.DataFrame(np.isposinf(df, *args, **kwargs)), - dtypes="bool", + dtypes=np.bool_, ) _iscomplex = Map.register( # Needed for numpy API lambda df, *args, **kwargs: pandas.DataFrame(np.iscomplex(df, *args, **kwargs)), - dtypes="bool", + dtypes=np.bool_, ) _isreal = Map.register( # Needed for numpy API lambda df, *args, **kwargs: pandas.DataFrame(np.isreal(df, *args, **kwargs)), - dtypes="bool", + dtypes=np.bool_, ) - _logical_not = Map.register(np.logical_not, dtypes="bool") # Needed for numpy API + _logical_not = Map.register(np.logical_not, dtypes=np.bool_) # Needed for numpy API _tanh = Map.register( lambda df, *args, **kwargs: pandas.DataFrame(np.tanh(df, *args, **kwargs)) ) # Needed for numpy API @@ -1923,7 +1923,7 @@ def convert_dtypes( lambda df, *args, **kwargs: pandas.DataFrame(np.exp(df, *args, **kwargs)) ) # Needed for numpy API negative = Map.register(pandas.DataFrame.__neg__) - notna = Map.register(pandas.DataFrame.notna, dtypes="bool") + notna = Map.register(pandas.DataFrame.notna, dtypes=np.bool_) round = Map.register(pandas.DataFrame.round) replace = Map.register(pandas.DataFrame.replace) series_view = Map.register( @@ -1949,24 +1949,24 @@ def convert_dtypes( str_capitalize = Map.register(_str_map("capitalize"), dtypes="copy") str_center = Map.register(_str_map("center"), dtypes="copy") - str_contains = Map.register(_str_map("contains"), dtypes="bool") - str_count = Map.register(_str_map("count"), dtypes="int64") - str_endswith = Map.register(_str_map("endswith"), dtypes="bool") - str_find = Map.register(_str_map("find"), dtypes="int64") + str_contains = Map.register(_str_map("contains"), dtypes=np.bool_) + str_count = Map.register(_str_map("count"), dtypes=np.int64) + str_endswith = Map.register(_str_map("endswith"), dtypes=np.bool_) + str_find = Map.register(_str_map("find"), dtypes=np.int64) str_findall = Map.register(_str_map("findall"), dtypes="copy") str_get = Map.register(_str_map("get"), dtypes="copy") - str_index = Map.register(_str_map("index"), dtypes="int64") - str_isalnum = Map.register(_str_map("isalnum"), dtypes="bool") - str_isalpha = Map.register(_str_map("isalpha"), dtypes="bool") - str_isdecimal = Map.register(_str_map("isdecimal"), dtypes="bool") - str_isdigit = Map.register(_str_map("isdigit"), dtypes="bool") - str_islower = Map.register(_str_map("islower"), dtypes="bool") - str_isnumeric = Map.register(_str_map("isnumeric"), dtypes="bool") - str_isspace = Map.register(_str_map("isspace"), dtypes="bool") - str_istitle = Map.register(_str_map("istitle"), dtypes="bool") - str_isupper = Map.register(_str_map("isupper"), dtypes="bool") + str_index = Map.register(_str_map("index"), dtypes=np.int64) + str_isalnum = Map.register(_str_map("isalnum"), dtypes=np.bool_) + str_isalpha = Map.register(_str_map("isalpha"), dtypes=np.bool_) + str_isdecimal = 
Map.register(_str_map("isdecimal"), dtypes=np.bool_) + str_isdigit = Map.register(_str_map("isdigit"), dtypes=np.bool_) + str_islower = Map.register(_str_map("islower"), dtypes=np.bool_) + str_isnumeric = Map.register(_str_map("isnumeric"), dtypes=np.bool_) + str_isspace = Map.register(_str_map("isspace"), dtypes=np.bool_) + str_istitle = Map.register(_str_map("istitle"), dtypes=np.bool_) + str_isupper = Map.register(_str_map("isupper"), dtypes=np.bool_) str_join = Map.register(_str_map("join"), dtypes="copy") - str_len = Map.register(_str_map("len"), dtypes="int64") + str_len = Map.register(_str_map("len"), dtypes=np.int64) str_ljust = Map.register(_str_map("ljust"), dtypes="copy") str_lower = Map.register(_str_map("lower"), dtypes="copy") str_lstrip = Map.register(_str_map("lstrip"), dtypes="copy") @@ -1995,8 +1995,8 @@ def str_extract(self, pat, flags, expand): return qc str_replace = Map.register(_str_map("replace"), dtypes="copy", shape_hint="column") - str_rfind = Map.register(_str_map("rfind"), dtypes="int64", shape_hint="column") - str_rindex = Map.register(_str_map("rindex"), dtypes="int64", shape_hint="column") + str_rfind = Map.register(_str_map("rfind"), dtypes=np.int64, shape_hint="column") + str_rindex = Map.register(_str_map("rindex"), dtypes=np.int64, shape_hint="column") str_rjust = Map.register(_str_map("rjust"), dtypes="copy", shape_hint="column") _str_rpartition = Map.register( _str_map("rpartition"), dtypes="copy", shape_hint="column" @@ -2030,7 +2030,7 @@ def str_split(self, pat=None, n=-1, expand=False, regex=None): return self._str_split(pat=pat, n=n, expand=False, regex=regex) str_startswith = Map.register( - _str_map("startswith"), dtypes="bool", shape_hint="column" + _str_map("startswith"), dtypes=np.bool_, shape_hint="column" ) str_strip = Map.register(_str_map("strip"), dtypes="copy", shape_hint="column") str_swapcase = Map.register( @@ -2102,49 +2102,51 @@ def searchsorted(df): # Dt map partitions operations - dt_date = Map.register(_dt_prop_map("date"), dtypes="object") - dt_time = Map.register(_dt_prop_map("time"), dtypes="object") - dt_timetz = Map.register(_dt_prop_map("timetz"), dtypes="object") + dt_date = Map.register(_dt_prop_map("date"), dtypes=np.object_) + dt_time = Map.register(_dt_prop_map("time"), dtypes=np.object_) + dt_timetz = Map.register(_dt_prop_map("timetz"), dtypes=np.object_) dt_year = Map.register(_dt_prop_map("year"), dtypes="int32") dt_month = Map.register(_dt_prop_map("month"), dtypes="int32") dt_day = Map.register(_dt_prop_map("day"), dtypes="int32") - dt_hour = Map.register(_dt_prop_map("hour"), dtypes="int64") - dt_minute = Map.register(_dt_prop_map("minute"), dtypes="int64") - dt_second = Map.register(_dt_prop_map("second"), dtypes="int64") - dt_microsecond = Map.register(_dt_prop_map("microsecond"), dtypes="int64") - dt_nanosecond = Map.register(_dt_prop_map("nanosecond"), dtypes="int64") - dt_dayofweek = Map.register(_dt_prop_map("dayofweek"), dtypes="int64") - dt_weekday = Map.register(_dt_prop_map("weekday"), dtypes="int64") - dt_dayofyear = Map.register(_dt_prop_map("dayofyear"), dtypes="int64") - dt_quarter = Map.register(_dt_prop_map("quarter"), dtypes="int64") - dt_is_month_start = Map.register(_dt_prop_map("is_month_start"), dtypes="bool") - dt_is_month_end = Map.register(_dt_prop_map("is_month_end"), dtypes="bool") - dt_is_quarter_start = Map.register(_dt_prop_map("is_quarter_start"), dtypes="bool") - dt_is_quarter_end = Map.register(_dt_prop_map("is_quarter_end"), dtypes="bool") - dt_is_year_start = 
Map.register(_dt_prop_map("is_year_start"), dtypes="bool") - dt_is_year_end = Map.register(_dt_prop_map("is_year_end"), dtypes="bool") - dt_is_leap_year = Map.register(_dt_prop_map("is_leap_year"), dtypes="bool") - dt_daysinmonth = Map.register(_dt_prop_map("daysinmonth"), dtypes="int64") - dt_days_in_month = Map.register(_dt_prop_map("days_in_month"), dtypes="int64") + dt_hour = Map.register(_dt_prop_map("hour"), dtypes=np.int64) + dt_minute = Map.register(_dt_prop_map("minute"), dtypes=np.int64) + dt_second = Map.register(_dt_prop_map("second"), dtypes=np.int64) + dt_microsecond = Map.register(_dt_prop_map("microsecond"), dtypes=np.int64) + dt_nanosecond = Map.register(_dt_prop_map("nanosecond"), dtypes=np.int64) + dt_dayofweek = Map.register(_dt_prop_map("dayofweek"), dtypes=np.int64) + dt_weekday = Map.register(_dt_prop_map("weekday"), dtypes=np.int64) + dt_dayofyear = Map.register(_dt_prop_map("dayofyear"), dtypes=np.int64) + dt_quarter = Map.register(_dt_prop_map("quarter"), dtypes=np.int64) + dt_is_month_start = Map.register(_dt_prop_map("is_month_start"), dtypes=np.bool_) + dt_is_month_end = Map.register(_dt_prop_map("is_month_end"), dtypes=np.bool_) + dt_is_quarter_start = Map.register( + _dt_prop_map("is_quarter_start"), dtypes=np.bool_ + ) + dt_is_quarter_end = Map.register(_dt_prop_map("is_quarter_end"), dtypes=np.bool_) + dt_is_year_start = Map.register(_dt_prop_map("is_year_start"), dtypes=np.bool_) + dt_is_year_end = Map.register(_dt_prop_map("is_year_end"), dtypes=np.bool_) + dt_is_leap_year = Map.register(_dt_prop_map("is_leap_year"), dtypes=np.bool_) + dt_daysinmonth = Map.register(_dt_prop_map("daysinmonth"), dtypes=np.int64) + dt_days_in_month = Map.register(_dt_prop_map("days_in_month"), dtypes=np.int64) dt_asfreq = Map.register(_dt_func_map("asfreq")) dt_to_period = Map.register(_dt_func_map("to_period")) - dt_to_pydatetime = Map.register(_dt_func_map("to_pydatetime"), dtypes="object") + dt_to_pydatetime = Map.register(_dt_func_map("to_pydatetime"), dtypes=np.object_) dt_tz_localize = Map.register(_dt_func_map("tz_localize")) dt_tz_convert = Map.register(_dt_func_map("tz_convert")) dt_normalize = Map.register(_dt_func_map("normalize")) - dt_strftime = Map.register(_dt_func_map("strftime"), dtypes="object") + dt_strftime = Map.register(_dt_func_map("strftime"), dtypes=np.object_) dt_round = Map.register(_dt_func_map("round")) dt_floor = Map.register(_dt_func_map("floor")) dt_ceil = Map.register(_dt_func_map("ceil")) - dt_month_name = Map.register(_dt_func_map("month_name"), dtypes="object") - dt_day_name = Map.register(_dt_func_map("day_name"), dtypes="object") - dt_to_pytimedelta = Map.register(_dt_func_map("to_pytimedelta"), dtypes="object") + dt_month_name = Map.register(_dt_func_map("month_name"), dtypes=np.object_) + dt_day_name = Map.register(_dt_func_map("day_name"), dtypes=np.object_) + dt_to_pytimedelta = Map.register(_dt_func_map("to_pytimedelta"), dtypes=np.object_) dt_total_seconds = Map.register(_dt_func_map("total_seconds"), dtypes="float64") - dt_seconds = Map.register(_dt_prop_map("seconds"), dtypes="int64") - dt_days = Map.register(_dt_prop_map("days"), dtypes="int64") - dt_microseconds = Map.register(_dt_prop_map("microseconds"), dtypes="int64") - dt_nanoseconds = Map.register(_dt_prop_map("nanoseconds"), dtypes="int64") - dt_qyear = Map.register(_dt_prop_map("qyear"), dtypes="int64") + dt_seconds = Map.register(_dt_prop_map("seconds"), dtypes=np.int64) + dt_days = Map.register(_dt_prop_map("days"), dtypes=np.int64) + dt_microseconds = 
Map.register(_dt_prop_map("microseconds"), dtypes=np.int64) + dt_nanoseconds = Map.register(_dt_prop_map("nanoseconds"), dtypes=np.int64) + dt_qyear = Map.register(_dt_prop_map("qyear"), dtypes=np.int64) dt_start_time = Map.register(_dt_prop_map("start_time")) dt_end_time = Map.register(_dt_prop_map("end_time")) dt_to_timestamp = Map.register(_dt_func_map("to_timestamp")) @@ -3190,7 +3192,7 @@ def _compute_duplicated(df): # pragma: no cover axis=1, function=_compute_hash, # TODO: pyarrow backend - dtypes="object", + dtypes=np.object_, ) else: hashed_modin_frame = self._modin_frame @@ -3199,7 +3201,7 @@ def _compute_duplicated(df): # pragma: no cover func=_compute_duplicated, new_index=self._modin_frame.copy_index_cache(), new_columns=[MODIN_UNNAMED_SERIES_LABEL], - dtypes="bool", + dtypes=np.bool_, keep_partitioning=True, ) return self.__constructor__(new_modin_frame, shape_hint="column") @@ -3626,7 +3628,7 @@ def groupby_mean(self, by, axis, groupby_kwargs, agg_args, agg_kwargs, drop=Fals ) qc_with_converted_datetime_cols = ( - self.astype({col: "int64" for col in datetime_cols.keys()}) + self.astype({col: np.int64 for col in datetime_cols.keys()}) if len(datetime_cols) > 0 else self ) @@ -4480,13 +4482,13 @@ def map_fn(df): # pragma: no cover if len(columns) == len(self.columns): # TODO: pyarrow backend new_modin_frame = self._modin_frame.apply_full_axis( - 0, map_fn, new_index=self.index, dtypes="bool" + 0, map_fn, new_index=self.index, dtypes=np.bool_ ) untouched_frame = None else: new_modin_frame = self._modin_frame.take_2d_labels_or_positional( col_labels=columns - ).apply_full_axis(0, map_fn, new_index=self.index, dtypes="bool") + ).apply_full_axis(0, map_fn, new_index=self.index, dtypes=np.bool_) untouched_frame = self.drop(columns=columns) # If we mapped over all the data we are done. If not, we need to # prepend the `new_modin_frame` with the raw data from the columns that were From a04b0a2b958f57b0548b4fa5aa9bd84ad8b1b797 Mon Sep 17 00:00:00 2001 From: Anatoly Myachev Date: Mon, 13 May 2024 02:29:51 +0200 Subject: [PATCH 34/50] cleanup Signed-off-by: Anatoly Myachev --- modin/core/storage_formats/base/query_compiler.py | 8 +------- modin/core/storage_formats/pandas/query_compiler.py | 10 +++++----- modin/pandas/dataframe.py | 4 ++-- modin/tests/pandas/utils.py | 10 +--------- 4 files changed, 9 insertions(+), 23 deletions(-) diff --git a/modin/core/storage_formats/base/query_compiler.py b/modin/core/storage_formats/base/query_compiler.py index 1d6cc719d17..6258d43503a 100644 --- a/modin/core/storage_formats/base/query_compiler.py +++ b/modin/core/storage_formats/base/query_compiler.py @@ -22,7 +22,7 @@ import abc import warnings from functools import cached_property -from typing import TYPE_CHECKING, Hashable, List, Optional +from typing import Hashable, List, Optional import numpy as np import pandas @@ -53,10 +53,6 @@ from . import doc_utils -if TYPE_CHECKING: - # TODO: should be ModinDataframe - from modin.core.dataframe.pandas.dataframe.dataframe import PandasDataframe - def _get_axis(axis): """ @@ -131,8 +127,6 @@ class BaseQueryCompiler( for a list of requirements for subclassing this object. """ - _modin_frame: PandasDataframe - def __wrap_in_qc(self, obj): """ Wrap `obj` in query compiler. 
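# For orientation between the hunks above and below: the backend reported by
# `get_backend()`/`get_pandas_backend()` is recognized purely from the dtypes.
# A minimal sketch of that detection, assuming the optional pyarrow package is
# installed:
import pandas

df = pandas.DataFrame({"a": [1, 2, None], "b": ["x", "y", None]})
df_pa = df.convert_dtypes(dtype_backend="pyarrow")
print(df_pa.dtypes)  # e.g. int64[pyarrow], string[pyarrow]
print(any(isinstance(t, pandas.ArrowDtype) for t in df_pa.dtypes))  # True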
diff --git a/modin/core/storage_formats/pandas/query_compiler.py b/modin/core/storage_formats/pandas/query_compiler.py index 4ad25f5ace5..9dfd15a69bf 100644 --- a/modin/core/storage_formats/pandas/query_compiler.py +++ b/modin/core/storage_formats/pandas/query_compiler.py @@ -945,7 +945,7 @@ def compute_dtypes_fn(dtypes, axis, **kwargs): ): return np.object_ # how to take into account backend here? - return "float64" + return np.float64 return TreeReduce.register( map_fn, @@ -2141,7 +2141,7 @@ def searchsorted(df): dt_month_name = Map.register(_dt_func_map("month_name"), dtypes=np.object_) dt_day_name = Map.register(_dt_func_map("day_name"), dtypes=np.object_) dt_to_pytimedelta = Map.register(_dt_func_map("to_pytimedelta"), dtypes=np.object_) - dt_total_seconds = Map.register(_dt_func_map("total_seconds"), dtypes="float64") + dt_total_seconds = Map.register(_dt_func_map("total_seconds"), dtypes=np.float64) dt_seconds = Map.register(_dt_prop_map("seconds"), dtypes=np.int64) dt_days = Map.register(_dt_prop_map("days"), dtypes=np.int64) dt_microseconds = Map.register(_dt_prop_map("microseconds"), dtypes=np.int64) @@ -2323,7 +2323,7 @@ def map_func(df): # pragma: no cover # Does it work with pyarrow backend? df_mask = np.isfinite(df) - result = np.empty((n_rows, n_cols), dtype="float64") + result = np.empty((n_rows, n_cols), dtype=np.float64) for i in range(n_rows): df_ith_row = df[i] @@ -2679,7 +2679,7 @@ def quantile_builder(df, **kwargs): lambda df: quantile_builder(df, **kwargs), new_index=q_index, new_columns=new_columns, - dtypes="float64", + dtypes=np.float64, ) result = self.__constructor__(new_modin_frame) return result.transpose() if axis == 1 else result @@ -2696,7 +2696,7 @@ def rank(self, **kwargs): if not numeric_only else None ), - dtypes="float64", + dtypes=np.float64, sync_labels=False, ) return self.__constructor__(new_modin_frame) diff --git a/modin/pandas/dataframe.py b/modin/pandas/dataframe.py index 53c68494249..0d00f614454 100644 --- a/modin/pandas/dataframe.py +++ b/modin/pandas/dataframe.py @@ -1623,7 +1623,7 @@ def prod( and numeric_only is False and min_count > len(axis_to_apply) # Type inference is not so simple for pyarrow - and self._query_compiler.get_backend() == "default" + and self._query_compiler.get_backend() is not None ): new_index = self.columns if not axis else self.index # >>> pd.DataFrame([1,2,3,4], dtype="int64[pyarrow]").prod(min_count=10) @@ -2153,7 +2153,7 @@ def sum( and numeric_only is False and min_count > len(axis_to_apply) # Type inference is not so simple for pyarrow - and self._query_compiler.get_backend() == "default" + and self._query_compiler.get_backend() is not None ): new_index = self.columns if not axis else self.index return Series( diff --git a/modin/tests/pandas/utils.py b/modin/tests/pandas/utils.py index b04026393a9..1b085cb6614 100644 --- a/modin/tests/pandas/utils.py +++ b/modin/tests/pandas/utils.py @@ -1088,11 +1088,7 @@ def eval_io_from_str(csv_str: str, unique_filename: str, **kwargs): def create_test_dfs(*args, **kwargs) -> tuple[pd.DataFrame, pandas.DataFrame]: - post_fn = kwargs.pop("post_fn", None) - - if post_fn is None: - # TODO: REVERT ME - post_fn = lambda df: df.convert_dtypes(dtype_backend="pyarrow") # noqa: E731 + post_fn = kwargs.pop("post_fn", lambda df: df) return tuple( map(post_fn, [pd.DataFrame(*args, **kwargs), pandas.DataFrame(*args, **kwargs)]) ) @@ -1108,10 +1104,6 @@ def create_test_series(vals, sort=False, **kwargs) -> tuple[pd.Series, pandas.Se if sort: modin_series = 
modin_series.sort_values().reset_index(drop=True) pandas_series = pandas_series.sort_values().reset_index(drop=True) - - # TODO: REVERT ME - modin_series = modin_series.convert_dtypes(dtype_backend="pyarrow") - pandas_series = pandas_series.convert_dtypes(dtype_backend="pyarrow") return modin_series, pandas_series From 60101b511522382771deca8d3452f4b93794e0f0 Mon Sep 17 00:00:00 2001 From: Anatoly Myachev Date: Mon, 13 May 2024 02:39:43 +0200 Subject: [PATCH 35/50] fix Signed-off-by: Anatoly Myachev --- .../storage_formats/pandas/query_compiler.py | 32 +++++++++---------- 1 file changed, 16 insertions(+), 16 deletions(-) diff --git a/modin/core/storage_formats/pandas/query_compiler.py b/modin/core/storage_formats/pandas/query_compiler.py index 9dfd15a69bf..afe1688a831 100644 --- a/modin/core/storage_formats/pandas/query_compiler.py +++ b/modin/core/storage_formats/pandas/query_compiler.py @@ -427,22 +427,22 @@ def to_numpy(self, **kwargs): combine_first = Binary.register( pandas.DataFrame.combine_first, infer_dtypes="common_cast" ) - eq = Binary.register(pandas.DataFrame.eq, infer_dtypes=np.bool_) + eq = Binary.register(pandas.DataFrame.eq, infer_dtypes="bool") equals = Binary.register( lambda df, other: pandas.DataFrame([[df.equals(other)]]), join_type=None, labels="drop", - infer_dtypes=np.bool_, + infer_dtypes="bool", ) floordiv = Binary.register(pandas.DataFrame.floordiv, infer_dtypes="try_sample") - ge = Binary.register(pandas.DataFrame.ge, infer_dtypes=np.bool_) - gt = Binary.register(pandas.DataFrame.gt, infer_dtypes=np.bool_) - le = Binary.register(pandas.DataFrame.le, infer_dtypes=np.bool_) - lt = Binary.register(pandas.DataFrame.lt, infer_dtypes=np.bool_) + ge = Binary.register(pandas.DataFrame.ge, infer_dtypes="bool") + gt = Binary.register(pandas.DataFrame.gt, infer_dtypes="bool") + le = Binary.register(pandas.DataFrame.le, infer_dtypes="bool") + lt = Binary.register(pandas.DataFrame.lt, infer_dtypes="bool") mod = Binary.register(pandas.DataFrame.mod, infer_dtypes="try_sample") mul = Binary.register(pandas.DataFrame.mul, infer_dtypes="try_sample") rmul = Binary.register(pandas.DataFrame.rmul, infer_dtypes="try_sample") - ne = Binary.register(pandas.DataFrame.ne, infer_dtypes=np.bool_) + ne = Binary.register(pandas.DataFrame.ne, infer_dtypes="bool") pow = Binary.register(pandas.DataFrame.pow, infer_dtypes="try_sample") radd = Binary.register(pandas.DataFrame.radd, infer_dtypes="try_sample") rfloordiv = Binary.register(pandas.DataFrame.rfloordiv, infer_dtypes="try_sample") @@ -452,12 +452,12 @@ def to_numpy(self, **kwargs): rtruediv = Binary.register(pandas.DataFrame.rtruediv, infer_dtypes="try_sample") sub = Binary.register(pandas.DataFrame.sub, infer_dtypes="try_sample") truediv = Binary.register(pandas.DataFrame.truediv, infer_dtypes="try_sample") - __and__ = Binary.register(pandas.DataFrame.__and__, infer_dtypes=np.bool_) - __or__ = Binary.register(pandas.DataFrame.__or__, infer_dtypes=np.bool_) - __rand__ = Binary.register(pandas.DataFrame.__rand__, infer_dtypes=np.bool_) - __ror__ = Binary.register(pandas.DataFrame.__ror__, infer_dtypes=np.bool_) - __rxor__ = Binary.register(pandas.DataFrame.__rxor__, infer_dtypes=np.bool_) - __xor__ = Binary.register(pandas.DataFrame.__xor__, infer_dtypes=np.bool_) + __and__ = Binary.register(pandas.DataFrame.__and__, infer_dtypes="bool") + __or__ = Binary.register(pandas.DataFrame.__or__, infer_dtypes="bool") + __rand__ = Binary.register(pandas.DataFrame.__rand__, infer_dtypes="bool") + __ror__ = Binary.register(pandas.DataFrame.__ror__, 
infer_dtypes="bool") + __rxor__ = Binary.register(pandas.DataFrame.__rxor__, infer_dtypes="bool") + __xor__ = Binary.register(pandas.DataFrame.__xor__, infer_dtypes="bool") df_update = Binary.register( copy_df_for_func(pandas.DataFrame.update, display_name="update"), join_type="left", @@ -475,19 +475,19 @@ def to_numpy(self, **kwargs): lambda df, other, *args, **kwargs: pandas.DataFrame( np.logical_and(df, other, *args, **kwargs) ), - infer_dtypes=np.bool_, + infer_dtypes="bool", ) _logical_or = Binary.register( lambda df, other, *args, **kwargs: pandas.DataFrame( np.logical_or(df, other, *args, **kwargs) ), - infer_dtypes=np.bool_, + infer_dtypes="bool", ) _logical_xor = Binary.register( lambda df, other, *args, **kwargs: pandas.DataFrame( np.logical_xor(df, other, *args, **kwargs) ), - infer_dtypes=np.bool_, + infer_dtypes="bool", ) def where(self, cond, other, **kwargs): From c25a41907e10e589901fce503287dbfd63458e6d Mon Sep 17 00:00:00 2001 From: Anatoly Myachev Date: Mon, 13 May 2024 12:20:09 +0200 Subject: [PATCH 36/50] cleanup Signed-off-by: Anatoly Myachev --- .../storage_formats/pandas/query_compiler.py | 22 ++++++++----------- modin/tests/pandas/dataframe/test_binary.py | 2 -- modin/tests/pandas/dataframe/test_default.py | 4 +--- 3 files changed, 10 insertions(+), 18 deletions(-) diff --git a/modin/core/storage_formats/pandas/query_compiler.py b/modin/core/storage_formats/pandas/query_compiler.py index a7332e5e590..228a8b8ab38 100644 --- a/modin/core/storage_formats/pandas/query_compiler.py +++ b/modin/core/storage_formats/pandas/query_compiler.py @@ -1950,7 +1950,7 @@ def convert_dtypes( str_capitalize = Map.register(_str_map("capitalize"), dtypes="copy") str_center = Map.register(_str_map("center"), dtypes="copy") str_contains = Map.register(_str_map("contains"), dtypes=np.bool_) - str_count = Map.register(_str_map("count"), dtypes=np.int64) + str_count = Map.register(_str_map("count"), dtypes=int) str_endswith = Map.register(_str_map("endswith"), dtypes=np.bool_) str_find = Map.register(_str_map("find"), dtypes=np.int64) str_findall = Map.register(_str_map("findall"), dtypes="copy") @@ -1966,7 +1966,7 @@ def convert_dtypes( str_istitle = Map.register(_str_map("istitle"), dtypes=np.bool_) str_isupper = Map.register(_str_map("isupper"), dtypes=np.bool_) str_join = Map.register(_str_map("join"), dtypes="copy") - str_len = Map.register(_str_map("len"), dtypes=np.int64) + str_len = Map.register(_str_map("len"), dtypes=int) str_ljust = Map.register(_str_map("ljust"), dtypes="copy") str_lower = Map.register(_str_map("lower"), dtypes="copy") str_lstrip = Map.register(_str_map("lstrip"), dtypes="copy") @@ -2105,9 +2105,9 @@ def searchsorted(df): dt_date = Map.register(_dt_prop_map("date"), dtypes=np.object_) dt_time = Map.register(_dt_prop_map("time"), dtypes=np.object_) dt_timetz = Map.register(_dt_prop_map("timetz"), dtypes=np.object_) - dt_year = Map.register(_dt_prop_map("year"), dtypes="int32") - dt_month = Map.register(_dt_prop_map("month"), dtypes="int32") - dt_day = Map.register(_dt_prop_map("day"), dtypes="int32") + dt_year = Map.register(_dt_prop_map("year"), dtypes=np.int32) + dt_month = Map.register(_dt_prop_map("month"), dtypes=np.int32) + dt_day = Map.register(_dt_prop_map("day"), dtypes=np.int32) dt_hour = Map.register(_dt_prop_map("hour"), dtypes=np.int64) dt_minute = Map.register(_dt_prop_map("minute"), dtypes=np.int64) dt_second = Map.register(_dt_prop_map("second"), dtypes=np.int64) @@ -2158,7 +2158,6 @@ def astype(self, col_dtypes, errors: str = "raise"): # other 
query compilers may not take care of error handling at the API # layer. This query compiler assumes there won't be any errors due to # invalid type keys. - # Function that can change the backend return self.__constructor__( self._modin_frame.astype(col_dtypes, errors=errors), shape_hint=self._shape_hint, @@ -2320,7 +2319,6 @@ def map_func(df): # pragma: no cover """Compute covariance or correlation matrix for the passed frame.""" df = df.to_numpy() n_rows = df.shape[0] - # Does it work with pyarrow backend? df_mask = np.isfinite(df) result = np.empty((n_rows, n_cols), dtype=np.float64) @@ -3191,8 +3189,7 @@ def _compute_duplicated(df): # pragma: no cover hashed_modin_frame = self._modin_frame.reduce( axis=1, function=_compute_hash, - # TODO: pyarrow backend - dtypes=np.object_, + dtypes=pandas.api.types.pandas_dtype("O"), ) else: hashed_modin_frame = self._modin_frame @@ -3628,7 +3625,7 @@ def groupby_mean(self, by, axis, groupby_kwargs, agg_args, agg_kwargs, drop=Fals ) qc_with_converted_datetime_cols = ( - self.astype({col: np.int64 for col in datetime_cols.keys()}) + self.astype({col: "int64" for col in datetime_cols.keys()}) if len(datetime_cols) > 0 else self ) @@ -4480,15 +4477,14 @@ def map_fn(df): # pragma: no cover # efficient if we are mapping over all of the data to do it this way # than it would be to reuse the code for specific columns. if len(columns) == len(self.columns): - # TODO: pyarrow backend new_modin_frame = self._modin_frame.apply_full_axis( - 0, map_fn, new_index=self.index, dtypes=np.bool_ + 0, map_fn, new_index=self.index, dtypes=bool ) untouched_frame = None else: new_modin_frame = self._modin_frame.take_2d_labels_or_positional( col_labels=columns - ).apply_full_axis(0, map_fn, new_index=self.index, dtypes=np.bool_) + ).apply_full_axis(0, map_fn, new_index=self.index, dtypes=bool) untouched_frame = self.drop(columns=columns) # If we mapped over all the data we are done. 
If not, we need to # prepend the `new_modin_frame` with the raw data from the columns that were diff --git a/modin/tests/pandas/dataframe/test_binary.py b/modin/tests/pandas/dataframe/test_binary.py index a1070d892b7..0a8aa80d6d3 100644 --- a/modin/tests/pandas/dataframe/test_binary.py +++ b/modin/tests/pandas/dataframe/test_binary.py @@ -472,8 +472,6 @@ def test_non_commutative_multiply(): eval_general(modin_df, pandas_df, lambda s: s * integer) -# TODO: just for developing purpose; remove `skip` mark -@pytest.mark.skip @pytest.mark.parametrize( "op", [ diff --git a/modin/tests/pandas/dataframe/test_default.py b/modin/tests/pandas/dataframe/test_default.py index 173e90e8762..76af5a110c9 100644 --- a/modin/tests/pandas/dataframe/test_default.py +++ b/modin/tests/pandas/dataframe/test_default.py @@ -528,8 +528,6 @@ def test_info(data, verbose, max_cols, memory_usage, show_counts): assert modin_info[1:] == pandas_info[1:] -# TODO: just for developing purpose; remove `xfail` mark -@pytest.mark.xfail @pytest.mark.parametrize("axis", axis_values, ids=axis_keys) @pytest.mark.parametrize("skipna", [False, True]) @pytest.mark.parametrize("numeric_only", [False, True]) @@ -718,7 +716,7 @@ def test_pivot_table_data(data, index, columns, values, aggfunc, request): "callable_tree_reduce_func" in request.node.callspec.id and "int_data" in request.node.callspec.id ): - expected_exception = TypeError("'float' object is not callable") + expected_exception = TypeError("'numpy.float64' object is not callable") eval_general( md_df, From 6e0c37e4b97c2a3d043f9b192392125ebfd9047b Mon Sep 17 00:00:00 2001 From: Anatoly Myachev Date: Mon, 13 May 2024 13:17:56 +0200 Subject: [PATCH 37/50] fixes Signed-off-by: Anatoly Myachev --- .../storage_formats/pandas/query_compiler.py | 5 ++--- modin/tests/pandas/utils.py | 21 ++++++++++++++++--- 2 files changed, 20 insertions(+), 6 deletions(-) diff --git a/modin/core/storage_formats/pandas/query_compiler.py b/modin/core/storage_formats/pandas/query_compiler.py index 228a8b8ab38..358ae635d47 100644 --- a/modin/core/storage_formats/pandas/query_compiler.py +++ b/modin/core/storage_formats/pandas/query_compiler.py @@ -943,9 +943,8 @@ def compute_dtypes_fn(dtypes, axis, **kwargs): and any(is_bool_dtype(t) for t in dtypes) and any(is_numeric_dtype(t) for t in dtypes) ): - return np.object_ - # how to take into account backend here? 
- return np.float64 + return "object" + return "float64" return TreeReduce.register( map_fn, diff --git a/modin/tests/pandas/utils.py b/modin/tests/pandas/utils.py index 227c438ebba..1949097ce4f 100644 --- a/modin/tests/pandas/utils.py +++ b/modin/tests/pandas/utils.py @@ -1087,14 +1087,25 @@ def eval_io_from_str(csv_str: str, unique_filename: str, **kwargs): ) -def create_test_dfs(*args, **kwargs) -> tuple[pd.DataFrame, pandas.DataFrame]: - post_fn = kwargs.pop("post_fn", lambda df: df) +def create_test_dfs( + *args, post_fn=None, backend=None, **kwargs +) -> tuple[pd.DataFrame, pandas.DataFrame]: + if post_fn is None: + post_fn = lambda df: ( # noqa: E731 + df.convert_dtypes(dtype_backend=backend) if backend is not None else df + ) + elif backend is not None: + post_fn = lambda df: post_fn(df).convert_dtypes( # noqa: E731 + dtype_backend=backend + ) return tuple( map(post_fn, [pd.DataFrame(*args, **kwargs), pandas.DataFrame(*args, **kwargs)]) ) -def create_test_series(vals, sort=False, **kwargs) -> tuple[pd.Series, pandas.Series]: +def create_test_series( + vals, sort=False, backend=None, **kwargs +) -> tuple[pd.Series, pandas.Series]: if isinstance(vals, dict): modin_series = pd.Series(vals[next(iter(vals.keys()))], **kwargs) pandas_series = pandas.Series(vals[next(iter(vals.keys()))], **kwargs) @@ -1104,6 +1115,10 @@ def create_test_series(vals, sort=False, **kwargs) -> tuple[pd.Series, pandas.Se if sort: modin_series = modin_series.sort_values().reset_index(drop=True) pandas_series = pandas_series.sort_values().reset_index(drop=True) + + if backend is not None: + modin_series = modin_series.convert_dtypes(dtype_backend=backend) + pandas_series = pandas_series.convert_dtypes(dtype_backend=backend) return modin_series, pandas_series From 778be0204bd323ada8d9c365ec708e1b421901a1 Mon Sep 17 00:00:00 2001 From: Anatoly Myachev Date: Mon, 13 May 2024 15:22:46 +0200 Subject: [PATCH 38/50] cleanup Signed-off-by: Anatoly Myachev --- modin/core/dataframe/base/dataframe/utils.py | 6 +++++- modin/tests/pandas/dataframe/test_default.py | 4 +++- modin/tests/pandas/dataframe/test_map_metadata.py | 10 +++------- modin/tests/pandas/dataframe/test_reduce.py | 3 +-- 4 files changed, 12 insertions(+), 11 deletions(-) diff --git a/modin/core/dataframe/base/dataframe/utils.py b/modin/core/dataframe/base/dataframe/utils.py index c8e3f742193..66657ae76cb 100644 --- a/modin/core/dataframe/base/dataframe/utils.py +++ b/modin/core/dataframe/base/dataframe/utils.py @@ -24,6 +24,7 @@ import pandas from pandas._typing import IndexLabel from pandas.api.types import is_scalar +from pandas.core.dtypes.common import is_float_dtype, is_numeric_dtype class Axis(Enum): # noqa: PR01 @@ -169,7 +170,10 @@ def is_trivial_index(index: pandas.Index) -> bool: return True if isinstance(index, pandas.RangeIndex): return index.start == 0 and index.step == 1 - if not (isinstance(index, pandas.Index) and index.dtype == "int64"): + if not ( + isinstance(index, pandas.Index) + and (is_numeric_dtype(index) and not is_float_dtype(index)) + ): return False return ( index.is_monotonic_increasing diff --git a/modin/tests/pandas/dataframe/test_default.py b/modin/tests/pandas/dataframe/test_default.py index 76af5a110c9..64fb650f10c 100644 --- a/modin/tests/pandas/dataframe/test_default.py +++ b/modin/tests/pandas/dataframe/test_default.py @@ -778,6 +778,7 @@ def test_pivot_table_data(data, index, columns, values, aggfunc, request): [pytest.param("Custom name", id="str_name")], ) @pytest.mark.parametrize("fill_value", [None, 0]) 
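# The `backend` dimension added just below reruns the test against both the
# default numpy-backed dtypes and pyarrow-backed dtypes; when a backend is
# given, `create_test_dfs` applies `convert_dtypes(dtype_backend=backend)` to
# both the modin and the pandas frame before the comparison.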
+@pytest.mark.parametrize("backend", [None, "pyarrow"]) def test_pivot_table_margins( data, index, @@ -786,13 +787,14 @@ def test_pivot_table_margins( aggfunc, margins_name, fill_value, + backend, request, ): expected_exception = None if "dict_func" in request.node.callspec.id: expected_exception = KeyError("Column(s) ['col28', 'col38'] do not exist") eval_general( - *create_test_dfs(data), + *create_test_dfs(data, backend=backend), operation=lambda df, *args, **kwargs: df.pivot_table(*args, **kwargs), index=index, columns=columns, diff --git a/modin/tests/pandas/dataframe/test_map_metadata.py b/modin/tests/pandas/dataframe/test_map_metadata.py index b4980118922..ab7a7fa4a31 100644 --- a/modin/tests/pandas/dataframe/test_map_metadata.py +++ b/modin/tests/pandas/dataframe/test_map_metadata.py @@ -1429,9 +1429,6 @@ def comparator(df1, df2): elif idx == 2: # FIXME: https://github.com/modin-project/modin/issues/7080 expected_exception = False - - if any("pyarrow" in str(dtype) for dtype in pandas_df.dtypes): - pytest.xfail(reason="ValueError(2)") eval_insert( modin_df, pandas_df, @@ -1686,13 +1683,12 @@ def test___neg__(request, data): @pytest.mark.parametrize("data", test_data_values, ids=test_data_keys) def test___invert__(data, request): expected_exception = None - md_df, pd_df = create_test_dfs(data) if "float_nan_data" in request.node.callspec.id: # FIXME: https://github.com/modin-project/modin/issues/7081 expected_exception = False - if any("pyarrow" in str(dtype) for dtype in pd_df.dtypes): - pytest.xfail(reason="pyarrow.lib.ArrowNotImplementedError") - eval_general(md_df, pd_df, lambda df: ~df, expected_exception=expected_exception) + eval_general( + *create_test_dfs(data), lambda df: ~df, expected_exception=expected_exception + ) def test___invert___bool(): diff --git a/modin/tests/pandas/dataframe/test_reduce.py b/modin/tests/pandas/dataframe/test_reduce.py index 2105c165183..74c8285ba04 100644 --- a/modin/tests/pandas/dataframe/test_reduce.py +++ b/modin/tests/pandas/dataframe/test_reduce.py @@ -324,10 +324,9 @@ def test_sum(data, axis, skipna, is_transposed, request): df_equals(modin_result, pandas_result) -@pytest.mark.parametrize("dtype", ["int64", "Int64"]) +@pytest.mark.parametrize("dtype", ["int64", "Int64", "int64[pyarrow]"]) def test_dtype_consistency(dtype): # test for issue #6781 - # TODO: add pyarrow dtype res_dtype = pd.DataFrame([1, 2, 3, 4], dtype=dtype).sum().dtype assert res_dtype == pandas.api.types.pandas_dtype(dtype) From 9d6d8394ae62a1ed520d69dd6d6d9d996ee138e0 Mon Sep 17 00:00:00 2001 From: Anatoly Myachev Date: Mon, 13 May 2024 16:12:59 +0200 Subject: [PATCH 39/50] cleanup Signed-off-by: Anatoly Myachev --- modin/core/dataframe/base/dataframe/utils.py | 7 ++----- modin/core/dataframe/pandas/metadata/dtypes.py | 2 +- modin/core/storage_formats/pandas/aggregations.py | 2 +- 3 files changed, 4 insertions(+), 7 deletions(-) diff --git a/modin/core/dataframe/base/dataframe/utils.py b/modin/core/dataframe/base/dataframe/utils.py index 66657ae76cb..7a1478ca5da 100644 --- a/modin/core/dataframe/base/dataframe/utils.py +++ b/modin/core/dataframe/base/dataframe/utils.py @@ -24,7 +24,7 @@ import pandas from pandas._typing import IndexLabel from pandas.api.types import is_scalar -from pandas.core.dtypes.common import is_float_dtype, is_numeric_dtype +from pandas.core.dtypes.common import is_integer_dtype class Axis(Enum): # noqa: PR01 @@ -170,10 +170,7 @@ def is_trivial_index(index: pandas.Index) -> bool: return True if isinstance(index, pandas.RangeIndex): return 
index.start == 0 and index.step == 1 - if not ( - isinstance(index, pandas.Index) - and (is_numeric_dtype(index) and not is_float_dtype(index)) - ): + if not (isinstance(index, pandas.Index) and is_integer_dtype(index)): return False return ( index.is_monotonic_increasing diff --git a/modin/core/dataframe/pandas/metadata/dtypes.py b/modin/core/dataframe/pandas/metadata/dtypes.py index c7979704db2..ceb205ef74a 100644 --- a/modin/core/dataframe/pandas/metadata/dtypes.py +++ b/modin/core/dataframe/pandas/metadata/dtypes.py @@ -496,7 +496,7 @@ def _merge_dtypes( # in the 'dtypes_matrix' series = pandas.Series(dtypes, name=i) dtypes_matrix = pandas.concat([dtypes_matrix, series], axis=1) - if val._know_all_names and val._remaining_dtype is None: + if not (val._know_all_names and val._remaining_dtype is None): dtypes_matrix.fillna( value={ # If we encountered a 'NaN' while 'val' describes all the columns, then diff --git a/modin/core/storage_formats/pandas/aggregations.py b/modin/core/storage_formats/pandas/aggregations.py index fd7d84f49d8..094b202700e 100644 --- a/modin/core/storage_formats/pandas/aggregations.py +++ b/modin/core/storage_formats/pandas/aggregations.py @@ -56,7 +56,7 @@ def corr_method( min_periods: int = 1, numeric_only: bool = True, ) -> PandasQueryCompiler: - if method != "pearson" or qc._modin_frame._pandas_backend == "pyarrow": + if method != "pearson" or qc.get_backend() == "pyarrow": return super(type(qc), qc).corr( method=method, min_periods=min_periods, numeric_only=numeric_only ) From 22f2db62f1ab9121bc80c95bfac8e048ad80c15b Mon Sep 17 00:00:00 2001 From: Anatoly Myachev Date: Mon, 13 May 2024 20:49:50 +0200 Subject: [PATCH 40/50] cleanup Signed-off-by: Anatoly Myachev --- modin/pandas/dataframe.py | 4 ++-- modin/tests/pandas/dataframe/test_binary.py | 19 ++++++++----------- modin/tests/pandas/dataframe/test_reduce.py | 6 ++++++ modin/tests/pandas/utils.py | 1 - 4 files changed, 16 insertions(+), 14 deletions(-) diff --git a/modin/pandas/dataframe.py b/modin/pandas/dataframe.py index fa67d6ad41d..d5b15f5be38 100644 --- a/modin/pandas/dataframe.py +++ b/modin/pandas/dataframe.py @@ -1623,7 +1623,7 @@ def prod( and numeric_only is False and min_count > len(axis_to_apply) # Type inference is not so simple for pyarrow - and self._query_compiler.get_backend() is not None + and self._query_compiler.get_backend() is None ): new_index = self.columns if not axis else self.index # >>> pd.DataFrame([1,2,3,4], dtype="int64[pyarrow]").prod(min_count=10) @@ -2153,7 +2153,7 @@ def sum( and numeric_only is False and min_count > len(axis_to_apply) # Type inference is not so simple for pyarrow - and self._query_compiler.get_backend() is not None + and self._query_compiler.get_backend() is None ): new_index = self.columns if not axis else self.index return Series( diff --git a/modin/tests/pandas/dataframe/test_binary.py b/modin/tests/pandas/dataframe/test_binary.py index 0a8aa80d6d3..9a72cd9d0dc 100644 --- a/modin/tests/pandas/dataframe/test_binary.py +++ b/modin/tests/pandas/dataframe/test_binary.py @@ -75,7 +75,8 @@ *("truediv", "rtruediv", "mul", "rmul", "floordiv", "rfloordiv"), ], ) -def test_math_functions(other, axis, op): +@pytest.mark.parametrize("backend", [None, "pyarrow"]) +def test_math_functions(other, axis, op, backend): data = test_data["float_nan_data"] if (op == "floordiv" or op == "rfloordiv") and axis == "rows": # lambda == "series_or_list" @@ -85,16 +86,12 @@ def test_math_functions(other, axis, op): # lambda == "series_or_list" pytest.xfail(reason="different 
behavior") - md_df, pd_df = create_test_dfs(data) - if op in ("mod", "rmod") and any("pyarrow" in str(dtype) for dtype in pd_df.dtypes): - with pytest.raises(NotImplementedError): - eval_general( - md_df, pd_df, lambda df: getattr(df, op)(other(df, axis), axis=axis) - ) - else: - eval_general( - md_df, pd_df, lambda df: getattr(df, op)(other(df, axis), axis=axis) - ) + if op in ("mod", "rmod") and backend == "pyarrow": + pytest.skip(reason="Not implemented for pyarrow backend") + eval_general( + *create_test_dfs(data, backend=backend), + lambda df: getattr(df, op)(other(df, axis), axis=axis), + ) @pytest.mark.parametrize("other", [lambda df: 2, lambda df: df]) diff --git a/modin/tests/pandas/dataframe/test_reduce.py b/modin/tests/pandas/dataframe/test_reduce.py index 74c8285ba04..d6f76d68507 100644 --- a/modin/tests/pandas/dataframe/test_reduce.py +++ b/modin/tests/pandas/dataframe/test_reduce.py @@ -355,6 +355,12 @@ def test_sum_prod_specific(fn, min_count, numeric_only): ) +@pytest.mark.parametrize("backend", [None, "pyarrow"]) +def test_sum_prod_min_count(backend): + md_df, pd_df = create_test_dfs(test_data["float_nan_data"], backend=backend) + eval_general(md_df, pd_df, lambda df: df.prod(min_count=len(pd_df) + 1)) + + @pytest.mark.parametrize("data", test_data_values, ids=test_data_keys) def test_sum_single_column(data): modin_df = pd.DataFrame(data).iloc[:, [0]] diff --git a/modin/tests/pandas/utils.py b/modin/tests/pandas/utils.py index 1949097ce4f..2dd4346c814 100644 --- a/modin/tests/pandas/utils.py +++ b/modin/tests/pandas/utils.py @@ -662,7 +662,6 @@ def assert_dtypes_equal(df1, df2): lambda obj: isinstance(obj, pandas.PeriodDtype), ) - # `test_pivot_table_margins` failed due to usage ``pd.NA`` in column name for idx in range(len(dtypes1)): for comparator in dtype_comparators: if assert_all_act_same(comparator, dtypes1.iloc[idx], dtypes2.iloc[idx]): From acc20b34d3bedeae52c15fb1c290931cd2167f9e Mon Sep 17 00:00:00 2001 From: Anatoly Myachev Date: Tue, 14 May 2024 00:16:53 +0200 Subject: [PATCH 41/50] cleanup Signed-off-by: Anatoly Myachev --- .../storage_formats/pandas/aggregations.py | 1 + .../storage_formats/pandas/query_compiler.py | 21 ++++++------------- modin/pandas/utils.py | 14 ++++++++++++- modin/tests/pandas/dataframe/test_default.py | 15 ++++++------- 4 files changed, 28 insertions(+), 23 deletions(-) diff --git a/modin/core/storage_formats/pandas/aggregations.py b/modin/core/storage_formats/pandas/aggregations.py index 094b202700e..8af6dd40bfb 100644 --- a/modin/core/storage_formats/pandas/aggregations.py +++ b/modin/core/storage_formats/pandas/aggregations.py @@ -56,6 +56,7 @@ def corr_method( min_periods: int = 1, numeric_only: bool = True, ) -> PandasQueryCompiler: + # Further implementation is designed for the default pandas backend (numpy) if method != "pearson" or qc.get_backend() == "pyarrow": return super(type(qc), qc).corr( method=method, min_periods=min_periods, numeric_only=numeric_only diff --git a/modin/core/storage_formats/pandas/query_compiler.py b/modin/core/storage_formats/pandas/query_compiler.py index 358ae635d47..8fb08a969cc 100644 --- a/modin/core/storage_formats/pandas/query_compiler.py +++ b/modin/core/storage_formats/pandas/query_compiler.py @@ -39,7 +39,6 @@ is_datetime64_any_dtype, is_list_like, is_numeric_dtype, - is_timedelta64_dtype, ) from pandas.core.groupby.base import transformation_kernels from pandas.core.indexes.api import ensure_index_from_sequences @@ -1855,7 +1854,6 @@ def isin_func(df, values): abs = 
Map.register(pandas.DataFrame.abs, dtypes="copy") map = Map.register(pandas.DataFrame.map) - # Will it work with pyarrow backend? conj = Map.register(lambda df, *args, **kwargs: pandas.DataFrame(np.conj(df))) def convert_dtypes( @@ -1876,13 +1874,14 @@ def convert_dtypes( convert_floating=convert_floating, dtype_backend=dtype_backend, ) + # TODO: `numpy_nullable` should be handled similar if dtype_backend == "pyarrow": result._modin_frame._pandas_backend = "pyarrow" return result invert = Map.register(pandas.DataFrame.__invert__, dtypes="copy") isna = Map.register(pandas.DataFrame.isna, dtypes=np.bool_) - # better way to distinguish methods for NumPy API? + # TODO: better way to distinguish methods for NumPy API? _isfinite = Map.register( lambda df, *args, **kwargs: pandas.DataFrame(np.isfinite(df, *args, **kwargs)), dtypes=np.bool_, @@ -2272,7 +2271,7 @@ def clip(self, lower, upper, **kwargs): corr = CorrCovBuilder.build_corr_method() def cov(self, min_periods=None, ddof=1): - if self._modin_frame._pandas_backend == "pyarrow": + if self.get_backend() == "pyarrow": return super().cov(min_periods=min_periods, ddof=ddof) # _nancorr use numpy which incompatible with pandas dataframes on pyarrow return self._nancorr(min_periods=min_periods, cov=True, ddof=ddof) @@ -2642,11 +2641,7 @@ def quantile_for_list_of_values(self, **kwargs): new_columns = [ col for col, dtype in zip(self.columns, self.dtypes) - if ( - is_numeric_dtype(dtype) - or is_timedelta64_dtype(dtype) - or is_datetime64_any_dtype(dtype) - ) + if (is_numeric_dtype(dtype) or lib.is_np_dtype(dtype, "mM")) ] if axis == 1: query_compiler = self.getitem_column_array(new_columns) @@ -2841,7 +2836,6 @@ def applyier(df, internal_indices, other=[], internal_other_indices=[]): # __getitem__ methods __getitem_bool = Binary.register( - # TODO: `is_scalar` don't work with pyarrow scalars lambda df, r: df[[r]] if is_scalar(r) else df[r], join_type="left", labels="drop", @@ -4532,20 +4526,17 @@ def iloc_mut(partition, row_internal_indices, col_internal_indices, item): pandas.DataFrame Partition data with updated values. """ + partition = partition.copy() try: partition.iloc[row_internal_indices, col_internal_indices] = item except ValueError: - # maybe make a copy only if there is an exception? - partition = partition.copy() # `copy` is needed to avoid "ValueError: buffer source array is read-only" for `item` # because the item may be converted to the type that is in the dataframe. # TODO: in the future we will need to convert to the correct type manually according # to the following warning. Example: "FutureWarning: Setting an item of incompatible # dtype is deprecated and will raise in a future error of pandas. Value '[1.38629436]' # has dtype incompatible with int64, please explicitly cast to a compatible dtype first." - partition.iloc[row_internal_indices, col_internal_indices] = ( - item.copy() if hasattr(item, "copy") else item - ) + partition.iloc[row_internal_indices, col_internal_indices] = item.copy() return partition if not is_scalar(item): diff --git a/modin/pandas/utils.py b/modin/pandas/utils.py index 6039baa2c7b..19e7f4df1c3 100644 --- a/modin/pandas/utils.py +++ b/modin/pandas/utils.py @@ -119,6 +119,19 @@ def is_scalar(obj): def get_pandas_backend(dtypes: pandas.Series) -> str | None: + """ + Determine the backend based on the `dtypes`. + + Parameters + ---------- + dtypes : pandas.Series + DataFrame dtypes. + + Returns + ------- + str | None + Backend name. 
+    """
     backend = None
     if any(isinstance(x, pandas.ArrowDtype) for x in dtypes):
         backend = "pyarrow"
@@ -306,7 +319,6 @@ def broadcast_item(
     try:
         # Cast to numpy drop information about heterogeneous types (cast to common)
         # TODO: we shouldn't do that, maybe there should be the if branch
-        # TODO: what if item comes from pyarrow
         item = np.array(item)
         if dtypes is None:
             dtypes = pandas.Series([item.dtype] * len(col_lookup))
diff --git a/modin/tests/pandas/dataframe/test_default.py b/modin/tests/pandas/dataframe/test_default.py
index 64fb650f10c..45ab3e2ec95 100644
--- a/modin/tests/pandas/dataframe/test_default.py
+++ b/modin/tests/pandas/dataframe/test_default.py
@@ -250,15 +250,16 @@ def test_combine_first():
 
 class TestCorr:
     @pytest.mark.parametrize("method", ["pearson", "kendall", "spearman"])
-    def test_corr(self, method):
+    @pytest.mark.parametrize("backend", [None, "pyarrow"])
+    def test_corr(self, method, backend):
         eval_general(
-            *create_test_dfs(test_data["int_data"]),
+            *create_test_dfs(test_data["int_data"], backend=backend),
             lambda df: df.corr(method=method),
         )
         # Modin result may slightly differ from pandas result
         # due to floating-point arithmetic.
         eval_general(
-            *create_test_dfs(test_data["float_nan_data"]),
+            *create_test_dfs(test_data["float_nan_data"], backend=backend),
             lambda df: df.corr(method=method),
             comparator=modin_df_almost_equals_pandas,
         )
@@ -352,7 +353,8 @@ def test_corr_nans_in_different_partitions(self):
 
 @pytest.mark.parametrize("min_periods", [1, 3, 5], ids=lambda x: f"min_periods={x}")
 @pytest.mark.parametrize("ddof", [1, 2, 4], ids=lambda x: f"ddof={x}")
-def test_cov(min_periods, ddof):
+@pytest.mark.parametrize("backend", [None, "pyarrow"])
+def test_cov(min_periods, ddof, backend):
     # Modin result may slightly differ from pandas result
     # due to floating-point arithmetic.
if StorageFormat.get() == "Hdk": @@ -366,13 +368,13 @@ def comparator1(df1, df2): comparator2 = modin_df_almost_equals_pandas eval_general( - *create_test_dfs(test_data["int_data"]), + *create_test_dfs(test_data["int_data"], backend=backend), lambda df: df.cov(min_periods=min_periods, ddof=ddof), comparator=comparator1, ) eval_general( - *create_test_dfs(test_data["float_nan_data"]), + *create_test_dfs(test_data["float_nan_data"], backend=backend), lambda df: df.cov(min_periods=min_periods), comparator=comparator2, ) @@ -634,7 +636,6 @@ def test_pivot(data, index, columns, values, request): expected_exception = ValueError( "Index contains duplicate entries, cannot reshape" ) - # failed because pandas doesn't preserve dtype backend eval_general( *create_test_dfs(data), lambda df, *args, **kwargs: df.pivot(*args, **kwargs), From 7a91fc451a141a4e562318de30f4413fb22d8f09 Mon Sep 17 00:00:00 2001 From: Anatoly Myachev Date: Tue, 14 May 2024 01:37:13 +0200 Subject: [PATCH 42/50] cleanup Signed-off-by: Anatoly Myachev --- .../dataframe/pandas/dataframe/dataframe.py | 13 ++++++------ .../pandas/partitioning/partition_manager.py | 7 +++---- .../storage_formats/base/query_compiler.py | 20 ++++++++++++++----- .../storage_formats/pandas/query_compiler.py | 4 ---- modin/pandas/base.py | 2 +- modin/utils.py | 3 +++ 6 files changed, 28 insertions(+), 21 deletions(-) diff --git a/modin/core/dataframe/pandas/dataframe/dataframe.py b/modin/core/dataframe/pandas/dataframe/dataframe.py index 06accd0884e..273fe8bf22d 100644 --- a/modin/core/dataframe/pandas/dataframe/dataframe.py +++ b/modin/core/dataframe/pandas/dataframe/dataframe.py @@ -1200,7 +1200,7 @@ def _take_2d_positional( + f"received: {type(indexer)}", ) if isinstance(indexer, list): - indexer = np.array(indexer, dtype="int64") + indexer = np.array(indexer, dtype=np.int64) indexers.append(indexer) row_positions, col_positions = indexers @@ -1760,7 +1760,6 @@ def astype_builder(df): new_frame = self._partition_mgr_cls.lazy_map_partitions( self._partitions, astype_builder ) - return self.__constructor__( new_frame, self.copy_index_cache(copy_lengths=True), @@ -1881,13 +1880,13 @@ def _get_dict_of_block_index(self, axis, indices, are_indices_sorted=False): return dict_of_slices if isinstance(indices, list): # Converting python list to numpy for faster processing - indices = np.array(indices, dtype="int64") + indices = np.array(indices, dtype=np.int64) # Fasttrack empty numpy array if isinstance(indices, np.ndarray) and indices.size == 0: # This will help preserve metadata stored in empty dataframes (indexes and dtypes) # Otherwise, we will get an empty `new_partitions` array, from which it will # no longer be possible to obtain metadata - return dict([(0, np.array([], dtype="int64"))]) + return dict([(0, np.array([], dtype=np.int64))]) negative_mask = np.less(indices, 0) has_negative = np.any(negative_mask) if has_negative: @@ -1895,7 +1894,7 @@ def _get_dict_of_block_index(self, axis, indices, are_indices_sorted=False): indices = ( indices.copy() if isinstance(indices, np.ndarray) - else np.array(indices, dtype="int64") + else np.array(indices, dtype=np.int64) ) indices[negative_mask] = indices[negative_mask] % len(self.get_axis(axis)) # If the `indices` array was modified because of the negative indices conversion @@ -4585,7 +4584,7 @@ def from_pandas(cls, df): new_index = df.index new_columns = df.columns new_dtypes = df.dtypes - new_frame, new_lengths, new_widths, pandas_backend = ( + new_frame, pandas_backend, new_lengths, new_widths = ( 
cls._partition_mgr_cls.from_pandas(df, True) ) return cls( @@ -4613,7 +4612,7 @@ def from_arrow(cls, at): PandasDataframe New Modin DataFrame. """ - new_frame, new_lengths, new_widths, pandas_backend = ( + new_frame, pandas_backend, new_lengths, new_widths = ( cls._partition_mgr_cls.from_arrow(at, return_dims=True) ) new_columns = Index.__new__(Index, data=at.column_names, dtype="O") diff --git a/modin/core/dataframe/pandas/partitioning/partition_manager.py b/modin/core/dataframe/pandas/partitioning/partition_manager.py index 0f2e99cfb22..c4a8afa47b2 100644 --- a/modin/core/dataframe/pandas/partitioning/partition_manager.py +++ b/modin/core/dataframe/pandas/partitioning/partition_manager.py @@ -946,7 +946,7 @@ def from_pandas(cls, df, return_dims=False): Returns ------- - np.ndarray or (np.ndarray, row_lengths, col_widths) + (np.ndarray, backend) or (np.ndarray, backend, row_lengths, col_widths) A NumPy array with partitions (with dimensions or not). """ num_splits = NPartitions.get() @@ -1008,7 +1008,7 @@ def update_bar(f): ) for i in range(0, len(df.columns), col_chunksize) ] - return parts, row_lengths, col_widths, backend + return parts, backend, row_lengths, col_widths @classmethod def from_arrow(cls, at, return_dims=False): @@ -1025,10 +1025,9 @@ def from_arrow(cls, at, return_dims=False): Returns ------- - np.ndarray or (np.ndarray, row_lengths, col_widths) + (np.ndarray, backend) or (np.ndarray, backend, row_lengths, col_widths) A NumPy array with partitions (with dimensions or not). """ - # also return backend return cls.from_pandas(at.to_pandas(), return_dims=return_dims) @classmethod diff --git a/modin/core/storage_formats/base/query_compiler.py b/modin/core/storage_formats/base/query_compiler.py index b4bd72ce0e0..662ce2c9dd1 100644 --- a/modin/core/storage_formats/base/query_compiler.py +++ b/modin/core/storage_formats/base/query_compiler.py @@ -1116,7 +1116,7 @@ def merge_asof( tolerance=None, allow_exact_matches: bool = True, direction: str = "backward", - ): + ): # noqa: GL08 # Pandas fallbacks for tricky cases: if ( # No idea how this works or why it does what it does; and in fact @@ -3620,7 +3620,9 @@ def groupby_fillna( drop=drop, ) - def groupby_diff(self, by, axis, groupby_kwargs, agg_args, agg_kwargs, drop=False): + def groupby_diff( + self, by, axis, groupby_kwargs, agg_args, agg_kwargs, drop=False + ): # noqa: GL08 return self.groupby_agg( by=by, agg_func="diff", @@ -3633,7 +3635,7 @@ def groupby_diff(self, by, axis, groupby_kwargs, agg_args, agg_kwargs, drop=Fals def groupby_pct_change( self, by, axis, groupby_kwargs, agg_args, agg_kwargs, drop=False - ): + ): # noqa: GL08 return self.groupby_agg( by=by, agg_func="pct_change", @@ -3941,7 +3943,7 @@ def groupby_ohlc( agg_args, agg_kwargs, is_df, - ): + ): # noqa: GL08 if not is_df: return self.groupby_agg( by=by, @@ -4605,7 +4607,7 @@ def shift( freq, axis, fill_value, - ): + ): # noqa: GL08 return DataFrameDefault.register(pandas.DataFrame.shift)( self, periods, freq, axis, fill_value ) @@ -6756,6 +6758,14 @@ def case_when(self, caselist): # noqa: PR01, RT01, D200 return SeriesDefault.register(pandas.Series.case_when)(self, caselist=caselist) def get_backend(self) -> Optional[str]: + """ + Get backend stored in `_modin_frame`. + + Returns + ------- + str | None + Backend name. 
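+
+        Notes
+        -----
+        A minimal sketch of the contract (assuming a frame built from
+        pyarrow-backed dtypes): the value is ``"pyarrow"`` when the frame
+        holds ``pandas.ArrowDtype`` columns and ``None`` for the default
+        NumPy backend.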
+ """ return self._modin_frame._pandas_backend def repartition(self, axis=None): diff --git a/modin/core/storage_formats/pandas/query_compiler.py b/modin/core/storage_formats/pandas/query_compiler.py index 8fb08a969cc..f8a0964079d 100644 --- a/modin/core/storage_formats/pandas/query_compiler.py +++ b/modin/core/storage_formats/pandas/query_compiler.py @@ -4602,10 +4602,6 @@ def sort_columns_by_row_values(self, rows, ascending=True, **kwargs): def cat_codes(self): def func(df: pandas.DataFrame) -> pandas.DataFrame: ser = df.iloc[:, 0] - if not isinstance(ser.dtype, pandas.CategoricalDtype): - raise TypeError( - f"Series dtype should be `CategoricalDtype`: actual dtype: {ser.dtype}" - ) return ser.cat.codes.to_frame(name=MODIN_UNNAMED_SERIES_LABEL) res = self._modin_frame.map(func=func, new_columns=[MODIN_UNNAMED_SERIES_LABEL]) diff --git a/modin/pandas/base.py b/modin/pandas/base.py index 16cdc1bdec2..746ed2ec9fa 100644 --- a/modin/pandas/base.py +++ b/modin/pandas/base.py @@ -364,7 +364,7 @@ def _validate_other( other_dtypes = [other.dtype] * len(other) elif is_dict_like(other): other_dtypes = [ - type(other[label]) + other[label] if pandas.isna(other[label]) else type(other[label]) for label in self._get_axis(axis) # The binary operation is applied for intersection of axis labels # and dictionary keys. So filtering out extra keys. diff --git a/modin/utils.py b/modin/utils.py index a3ed1dc91a3..ee72620fa7b 100644 --- a/modin/utils.py +++ b/modin/utils.py @@ -310,6 +310,9 @@ def _replace_doc( target_doc = target_obj.__doc__ or "" overwrite = overwrite or not target_doc doc = source_doc if overwrite else target_doc + if doc == "": + # Empty docstrings do not need to be inherited + return if parent_cls and not attr_name: if isinstance(target_obj, property): From d31e93f497844c1b38a8478088a29f4644d4307a Mon Sep 17 00:00:00 2001 From: Anatoly Myachev Date: Tue, 14 May 2024 01:38:54 +0200 Subject: [PATCH 43/50] revert changes in metadata/dtypes.py Signed-off-by: Anatoly Myachev --- modin/core/dataframe/pandas/metadata/dtypes.py | 6 +----- 1 file changed, 1 insertion(+), 5 deletions(-) diff --git a/modin/core/dataframe/pandas/metadata/dtypes.py b/modin/core/dataframe/pandas/metadata/dtypes.py index ceb205ef74a..96f06aa757f 100644 --- a/modin/core/dataframe/pandas/metadata/dtypes.py +++ b/modin/core/dataframe/pandas/metadata/dtypes.py @@ -528,11 +528,7 @@ def _merge_dtypes( def combine_dtypes(row): if (row == "unknown").any(): return "unknown" - if any("pyarrow" in str(x) for x in row): - # nans can be stored not only in float types, for example in `bool[pyarrow]` - row = row[~row.isna()] - else: - row = row.fillna(pandas.api.types.pandas_dtype("float")) + row = row.fillna(pandas.api.types.pandas_dtype("float")) return find_common_type(list(row.values)) dtypes = dtypes_matrix.apply(combine_dtypes, axis=1) From b3179fcd9d0313e85e51c8a92853f788b959897b Mon Sep 17 00:00:00 2001 From: Anatoly Myachev Date: Tue, 14 May 2024 01:45:18 +0200 Subject: [PATCH 44/50] fix tests Signed-off-by: Anatoly Myachev --- modin/core/dataframe/pandas/dataframe/dataframe.py | 2 ++ modin/tests/pandas/test_series.py | 14 ++++++++------ 2 files changed, 10 insertions(+), 6 deletions(-) diff --git a/modin/core/dataframe/pandas/dataframe/dataframe.py b/modin/core/dataframe/pandas/dataframe/dataframe.py index 23bd9f4428a..bbf3af12ef9 100644 --- a/modin/core/dataframe/pandas/dataframe/dataframe.py +++ b/modin/core/dataframe/pandas/dataframe/dataframe.py @@ -101,6 +101,8 @@ class PandasDataframe( each of the block partitions. 
Is computed if not provided.
     dtypes : pandas.Series or callable, optional
         The data types for the dataframe columns.
+    pandas_backend : {"pyarrow", None}, optional
+        Backend used by pandas.
     """
 
     _partition_mgr_cls: PandasDataframePartitionManager
diff --git a/modin/tests/pandas/test_series.py b/modin/tests/pandas/test_series.py
index dd8dbe85da9..4c45102ede5 100644
--- a/modin/tests/pandas/test_series.py
+++ b/modin/tests/pandas/test_series.py
@@ -1419,12 +1419,14 @@ def comparator(df1, df2):
         comparator=comparator,
     )
 
-    eval_general(
-        modin_series,
-        pandas_series,
-        lambda ser: ser > (ser + 1),
-        comparator=comparator,
-    )
+    if StorageFormat.get() != "Hdk":
+        # FIXME: HDK should also work in this case
+        eval_general(
+            modin_series,
+            pandas_series,
+            lambda ser: ser > (ser + 1),
+            comparator=comparator,
+        )
 
     eval_general(
         modin_series,

From b3471ff7e921454512381f9c9eb267a2777c04dc Mon Sep 17 00:00:00 2001
From: Anatoly Myachev
Date: Tue, 14 May 2024 13:05:05 +0200
Subject: [PATCH 45/50] cleanup

Signed-off-by: Anatoly Myachev
---
 .../pandas_on_dask/dataframe/dataframe.py           |  2 ++
 .../pandas_on_python/dataframe/dataframe.py         |  2 ++
 .../pandas_on_ray/dataframe/dataframe.py            |  2 ++
 .../pandas_on_unidist/dataframe/dataframe.py        |  2 ++
 modin/core/storage_formats/base/query_compiler.py   | 12 +++++-------
 modin/utils.py                                      |  3 ---
 6 files changed, 13 insertions(+), 10 deletions(-)

diff --git a/modin/core/execution/dask/implementations/pandas_on_dask/dataframe/dataframe.py b/modin/core/execution/dask/implementations/pandas_on_dask/dataframe/dataframe.py
index 0920d963840..5e4598d0ddf 100644
--- a/modin/core/execution/dask/implementations/pandas_on_dask/dataframe/dataframe.py
+++ b/modin/core/execution/dask/implementations/pandas_on_dask/dataframe/dataframe.py
@@ -38,6 +38,8 @@ class PandasOnDaskDataframe(PandasDataframe):
         each of the block partitions. Is computed if not provided.
     dtypes : pandas.Series, optional
         The data types for the dataframe columns.
+    pandas_backend : {"pyarrow", None}, optional
+        Backend used by pandas. ``None`` means the default NumPy backend.
     """
 
     _partition_mgr_cls = PandasOnDaskDataframePartitionManager
diff --git a/modin/core/execution/python/implementations/pandas_on_python/dataframe/dataframe.py b/modin/core/execution/python/implementations/pandas_on_python/dataframe/dataframe.py
index 6e314beaa9c..0e2bc70d995 100644
--- a/modin/core/execution/python/implementations/pandas_on_python/dataframe/dataframe.py
+++ b/modin/core/execution/python/implementations/pandas_on_python/dataframe/dataframe.py
@@ -45,6 +45,8 @@ class PandasOnPythonDataframe(PandasDataframe):
         each of the block partitions. Is computed if not provided.
     dtypes : pandas.Series, optional
         The data types for the dataframe columns.
+    pandas_backend : {"pyarrow", None}, optional
+        Backend used by pandas. ``None`` means the default NumPy backend.
     """
 
     _partition_mgr_cls = PandasOnPythonDataframePartitionManager
diff --git a/modin/core/execution/ray/implementations/pandas_on_ray/dataframe/dataframe.py b/modin/core/execution/ray/implementations/pandas_on_ray/dataframe/dataframe.py
index 6838fd9edca..373a84ecdb4 100644
--- a/modin/core/execution/ray/implementations/pandas_on_ray/dataframe/dataframe.py
+++ b/modin/core/execution/ray/implementations/pandas_on_ray/dataframe/dataframe.py
@@ -39,6 +39,8 @@ class PandasOnRayDataframe(PandasDataframe):
         each of the block partitions. Is computed if not provided.
     dtypes : pandas.Series, optional
         The data types for the dataframe columns.
+    pandas_backend : {"pyarrow", None}, optional
+        Backend used by pandas. ``None`` means the default NumPy backend.
     """
 
     _partition_mgr_cls = PandasOnRayDataframePartitionManager
diff --git a/modin/core/execution/unidist/implementations/pandas_on_unidist/dataframe/dataframe.py b/modin/core/execution/unidist/implementations/pandas_on_unidist/dataframe/dataframe.py
index 3241e9299e8..9adba6bc6dc 100644
--- a/modin/core/execution/unidist/implementations/pandas_on_unidist/dataframe/dataframe.py
+++ b/modin/core/execution/unidist/implementations/pandas_on_unidist/dataframe/dataframe.py
@@ -38,6 +38,8 @@ class PandasOnUnidistDataframe(PandasDataframe):
         each of the block partitions. Is computed if not provided.
     dtypes : pandas.Series, optional
         The data types for the dataframe columns.
+    pandas_backend : {"pyarrow", None}, optional
+        Backend used by pandas. ``None`` means the default NumPy backend.
     """
 
     _partition_mgr_cls = PandasOnUnidistDataframePartitionManager
diff --git a/modin/core/storage_formats/base/query_compiler.py b/modin/core/storage_formats/base/query_compiler.py
index df21d021b82..25c38929014 100644
--- a/modin/core/storage_formats/base/query_compiler.py
+++ b/modin/core/storage_formats/base/query_compiler.py
@@ -1118,7 +1118,7 @@ def merge_asof(
         tolerance=None,
         allow_exact_matches: bool = True,
         direction: str = "backward",
-    ):  # noqa: GL08
+    ):
         # Pandas fallbacks for tricky cases:
         if (
             # No idea how this works or why it does what it does; and in fact
@@ -3622,9 +3622,7 @@ def groupby_fillna(
             drop=drop,
         )
 
-    def groupby_diff(
-        self, by, axis, groupby_kwargs, agg_args, agg_kwargs, drop=False
-    ):  # noqa: GL08
+    def groupby_diff(self, by, axis, groupby_kwargs, agg_args, agg_kwargs, drop=False):
         return self.groupby_agg(
             by=by,
             agg_func="diff",
@@ -3637,7 +3635,7 @@ def groupby_pct_change(
         self, by, axis, groupby_kwargs, agg_args, agg_kwargs, drop=False
-    ):  # noqa: GL08
+    ):
         return self.groupby_agg(
             by=by,
             agg_func="pct_change",
@@ -3945,7 +3943,7 @@ def groupby_ohlc(
         agg_args,
         agg_kwargs,
         is_df,
-    ):  # noqa: GL08
+    ):
         if not is_df:
             return self.groupby_agg(
                 by=by,
@@ -4609,7 +4607,7 @@ def shift(
         freq,
         axis,
         fill_value,
-    ):  # noqa: GL08
+    ):
         return DataFrameDefault.register(pandas.DataFrame.shift)(
             self, periods, freq, axis, fill_value
         )
diff --git a/modin/utils.py b/modin/utils.py
index ee72620fa7b..a3ed1dc91a3 100644
--- a/modin/utils.py
+++ b/modin/utils.py
@@ -310,9 +310,6 @@ def _replace_doc(
     target_doc = target_obj.__doc__ or ""
     overwrite = overwrite or not target_doc
     doc = source_doc if overwrite else target_doc
-    if doc == "":
-        # Empty docstrings do not need to be inherited
-        return
 
     if parent_cls and not attr_name:
         if isinstance(target_obj, property):

From 14b4dd3274d30e8bfb4fb4894c512f381f3db6db Mon Sep 17 00:00:00 2001
From: Anatoly Myachev
Date: Tue, 14 May 2024 13:22:28 +0200
Subject: [PATCH 46/50] fix

Signed-off-by: Anatoly Myachev
---
 .../ray/implementations/cudf_on_ray/dataframe/dataframe.py | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/modin/core/execution/ray/implementations/cudf_on_ray/dataframe/dataframe.py b/modin/core/execution/ray/implementations/cudf_on_ray/dataframe/dataframe.py
index a40d10cd4c1..b4fef2ed18a 100644
--- a/modin/core/execution/ray/implementations/cudf_on_ray/dataframe/dataframe.py
+++ b/modin/core/execution/ray/implementations/cudf_on_ray/dataframe/dataframe.py
@@ -50,6 +50,8 @@ class cuDFOnRayDataframe(PandasOnRayDataframe):
         each of the block partitions. Is computed if not provided.
     dtypes : pandas.Series, optional
         The data types for the dataframe columns.
+    pandas_backend : {"pyarrow", None}, optional
+        Backend used by pandas. ``None`` means the default NumPy backend.
     """
 
     _partition_mgr_cls = cuDFOnRayDataframePartitionManager

From 7abfc427bf359c98c87ea94883af2f5a8f13f42b Mon Sep 17 00:00:00 2001
From: Anatoly Myachev
Date: Tue, 14 May 2024 15:54:20 +0200
Subject: [PATCH 47/50] Apply suggestions from code review

Co-authored-by: Iaroslav Igoshev
---
 modin/core/dataframe/pandas/dataframe/dataframe.py | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/modin/core/dataframe/pandas/dataframe/dataframe.py b/modin/core/dataframe/pandas/dataframe/dataframe.py
index 0d3cffdbd59..cd0e75ed861 100644
--- a/modin/core/dataframe/pandas/dataframe/dataframe.py
+++ b/modin/core/dataframe/pandas/dataframe/dataframe.py
@@ -143,13 +143,13 @@ def __init__(
         self._row_lengths_cache = row_lengths
         self._column_widths_cache = column_widths
         self._pandas_backend = pandas_backend
-        if not pandas_backend == "pyarrow":
+        if pandas_backend != "pyarrow":
+            self.set_dtypes_cache(dtypes)
+        else:
             # In this case, the type precomputation may be incorrect; we need
             # to know the type algebra precisely. Considering the number of operations
             # and different combinations of backends, the best solution would be to
             # introduce optimizations gradually, with a large number of tests.
-            self.set_dtypes_cache(dtypes)
-        else:
             self.set_dtypes_cache(None)
 
         self._validate_axes_lengths()

From 30d47494f6d5108a32e1ea77523fd7138f0274fb Mon Sep 17 00:00:00 2001
From: Anatoly Myachev
Date: Tue, 14 May 2024 15:56:45 +0200
Subject: [PATCH 48/50] Apply suggestions from code review

Co-authored-by: Iaroslav Igoshev
---
 modin/tests/pandas/test_series.py | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/modin/tests/pandas/test_series.py b/modin/tests/pandas/test_series.py
index 4c45102ede5..be737e4c70d 100644
--- a/modin/tests/pandas/test_series.py
+++ b/modin/tests/pandas/test_series.py
@@ -1420,7 +1420,8 @@ def comparator(df1, df2):
     )
 
     if StorageFormat.get() != "Hdk":
-        # FIXME: HDK should also work in this case
+        # FIXME: HDK should also work in this case, but since it
+        # is deprecated, we will simply remove this branch later
         eval_general(
             modin_series,
             pandas_series,

From 3213194b2733f7efe74225a5a0bf671b7aa4ef15 Mon Sep 17 00:00:00 2001
From: Anatoly Myachev
Date: Tue, 14 May 2024 16:13:34 +0200
Subject: [PATCH 49/50] address review comments

Signed-off-by: Anatoly Myachev
---
 modin/core/storage_formats/base/query_compiler.py   | 2 +-
 modin/core/storage_formats/pandas/aggregations.py   | 2 +-
 modin/core/storage_formats/pandas/query_compiler.py | 2 +-
 modin/pandas/dataframe.py                           | 8 ++++----
 4 files changed, 7 insertions(+), 7 deletions(-)

diff --git a/modin/core/storage_formats/base/query_compiler.py b/modin/core/storage_formats/base/query_compiler.py
index 25c38929014..50008f261a4 100644
--- a/modin/core/storage_formats/base/query_compiler.py
+++ b/modin/core/storage_formats/base/query_compiler.py
@@ -6757,7 +6757,7 @@ def case_when(self, caselist):  # noqa: PR01, RT01, D200
         ]
         return SeriesDefault.register(pandas.Series.case_when)(self, caselist=caselist)
 
-    def get_backend(self) -> Optional[str]:
+    def get_pandas_backend(self) -> Optional[str]:
         """
         Get backend stored in `_modin_frame`.
 
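A minimal usage sketch of the renamed accessor (assuming pyarrow is
installed; `_query_compiler` is internal API, and the expected values
follow from the backend detection added in the earlier patches):

    import modin.pandas as pd

    ser = pd.Series([1, 2, None], dtype="uint8[pyarrow]")
    # pyarrow-backed dtypes are detected when the frame is created
    assert ser._query_compiler.get_pandas_backend() == "pyarrow"

    ser2 = pd.Series([1, 2, 3])  # default NumPy-backed dtypes
    assert ser2._query_compiler.get_pandas_backend() is None
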
diff --git a/modin/core/storage_formats/pandas/aggregations.py b/modin/core/storage_formats/pandas/aggregations.py
index 8af6dd40bfb..3959b86f3ab 100644
--- a/modin/core/storage_formats/pandas/aggregations.py
+++ b/modin/core/storage_formats/pandas/aggregations.py
@@ -57,7 +57,7 @@ def corr_method(
         numeric_only: bool = True,
     ) -> PandasQueryCompiler:
         # The implementation below is designed for the default pandas backend (NumPy)
-        if method != "pearson" or qc.get_backend() == "pyarrow":
+        if method != "pearson" or qc.get_pandas_backend() == "pyarrow":
             return super(type(qc), qc).corr(
                 method=method, min_periods=min_periods, numeric_only=numeric_only
             )
diff --git a/modin/core/storage_formats/pandas/query_compiler.py b/modin/core/storage_formats/pandas/query_compiler.py
index 4f55159aa60..0014f4992ef 100644
--- a/modin/core/storage_formats/pandas/query_compiler.py
+++ b/modin/core/storage_formats/pandas/query_compiler.py
@@ -2284,7 +2284,7 @@ def clip(self, lower, upper, **kwargs):
 
     corr = CorrCovBuilder.build_corr_method()
 
     def cov(self, min_periods=None, ddof=1):
-        if self.get_backend() == "pyarrow":
+        if self.get_pandas_backend() == "pyarrow":
             return super().cov(min_periods=min_periods, ddof=ddof)
         # `_nancorr` uses NumPy, which is incompatible with pyarrow-backed pandas dataframes
         return self._nancorr(min_periods=min_periods, cov=True, ddof=ddof)
diff --git a/modin/pandas/dataframe.py b/modin/pandas/dataframe.py
index d5b15f5be38..7cbfc2634d9 100644
--- a/modin/pandas/dataframe.py
+++ b/modin/pandas/dataframe.py
@@ -1622,8 +1622,8 @@ def prod(
             skipna is not False
             and numeric_only is False
             and min_count > len(axis_to_apply)
-            # Type inference is not so simple for pyarrow
-            and self._query_compiler.get_backend() is None
+            # This fast path is only suitable for the default backend
+            and self._query_compiler.get_pandas_backend() is None
         ):
             new_index = self.columns if not axis else self.index
             # >>> pd.DataFrame([1,2,3,4], dtype="int64[pyarrow]").prod(min_count=10)
@@ -2152,8 +2152,8 @@ def sum(
             skipna is not False
             and numeric_only is False
             and min_count > len(axis_to_apply)
-            # Type inference is not so simple for pyarrow
-            and self._query_compiler.get_backend() is None
+            # This fast path is only suitable for the default backend
+            and self._query_compiler.get_pandas_backend() is None
         ):
             new_index = self.columns if not axis else self.index
             return Series(

From 45acef9b2b6f1c3f2f9611db16549350ee30ffaf Mon Sep 17 00:00:00 2001
From: Anatoly Myachev
Date: Tue, 14 May 2024 19:56:08 +0200
Subject: [PATCH 50/50] expand comments

Signed-off-by: Anatoly Myachev
---
 modin/core/dataframe/pandas/dataframe/dataframe.py | 4 ++++
 modin/core/storage_formats/pandas/groupby.py       | 1 +
 modin/tests/pandas/dataframe/test_binary.py        | 2 +-
 3 files changed, 6 insertions(+), 1 deletion(-)

diff --git a/modin/core/dataframe/pandas/dataframe/dataframe.py b/modin/core/dataframe/pandas/dataframe/dataframe.py
@@ -3985,6 +3985,7 @@ def _compute_new_widths():
         new_columns = joined_index
         frames = [self] + others
         # TODO: should we wrap all `concat` calls into a "try except" block?
+        # `ModinDtypes.concat` can throw an exception in the case of duplicate values
         new_dtypes = ModinDtypes.concat([frame._dtypes for frame in frames], axis=1)
         # If we have already cached the length of each row in at least one
         # of the row's partitions, we can build new_lengths for the new
@@ -4621,6 +4622,9 @@ def _arrow_type_to_dtype(cls, arrow_type):
 
         try:
             # TODO: should we map arrow types to pyarrow-backed pandas types?
+            # It seems like this might help avoid the expense of transferring
+            # data between backends (numpy and pyarrow), but we need to be sure
+            # how this fits into the type inference system in pandas.
             res = arrow_type.to_pandas_dtype()
             # Conversion to pandas is not implemented for some arrow types,
             # perform manual conversion for them:
diff --git a/modin/core/storage_formats/pandas/groupby.py b/modin/core/storage_formats/pandas/groupby.py
index 4b22b5c0158..55de645a898 100644
--- a/modin/core/storage_formats/pandas/groupby.py
+++ b/modin/core/storage_formats/pandas/groupby.py
@@ -360,6 +360,7 @@ def applyier(df, other):  # pragma: no cover
         # different partitions
         if len(index) == 0 and len(columns) > 0:
             common_type = find_common_type(result.dtypes.tolist())
+            # TODO: remove find_common_type+astype after pandas fixes the following issue:
             # transpose loses dtypes: https://github.com/pandas-dev/pandas/issues/43337
             result = result.transpose().astype(common_type, copy=False)
 
diff --git a/modin/tests/pandas/dataframe/test_binary.py b/modin/tests/pandas/dataframe/test_binary.py
index 9a72cd9d0dc..10dabbe32bc 100644
--- a/modin/tests/pandas/dataframe/test_binary.py
+++ b/modin/tests/pandas/dataframe/test_binary.py
@@ -87,7 +87,7 @@ def test_math_functions(other, axis, op, backend):
         pytest.xfail(reason="different behavior")
 
     if op in ("mod", "rmod") and backend == "pyarrow":
-        pytest.skip(reason="Not implemented for pyarrow backend")
+        pytest.skip(reason="These functions are not implemented in pandas itself")
     eval_general(
         *create_test_dfs(data, backend=backend),
         lambda df: getattr(df, op)(other(df, axis), axis=axis),