From fbebef726f80282b151971849d0b2bc0246c9b58 Mon Sep 17 00:00:00 2001 From: Wenjun Si Date: Thu, 17 Feb 2022 06:26:05 +0800 Subject: [PATCH] Support inclusive argument for pd.date_range (#2718) --- .../arithmetic/tests/test_arithmetic.py | 44 +++-- mars/dataframe/core.py | 12 +- mars/dataframe/datasource/date_range.py | 183 ++++++++---------- .../tests/test_datasource_execution.py | 44 +++-- mars/dataframe/indexing/set_index.py | 2 +- mars/dataframe/sort/psrs.py | 2 +- mars/dataframe/sort/sort_values.py | 3 +- mars/dataframe/tests/test_utils.py | 24 +-- mars/dataframe/utils.py | 4 +- mars/learn/ensemble/_bagging.py | 4 +- .../scheduling/worker/tests/test_execution.py | 2 +- mars/tensor/statistics/bincount.py | 2 +- 12 files changed, 185 insertions(+), 141 deletions(-) diff --git a/mars/dataframe/arithmetic/tests/test_arithmetic.py b/mars/dataframe/arithmetic/tests/test_arithmetic.py index 49d51f05e1..ff81c3fa1c 100644 --- a/mars/dataframe/arithmetic/tests/test_arithmetic.py +++ b/mars/dataframe/arithmetic/tests/test_arithmetic.py @@ -176,7 +176,9 @@ def test_without_shuffle(func_name, func_opts): assert df3.columns_value.should_be_monotonic is True assert isinstance(df3.index_value.value, IndexValue.Int64Index) assert df3.index_value.should_be_monotonic is True - pd.testing.assert_index_equal(df3.index_value.to_pandas(), pd.Int64Index([])) + pd.testing.assert_index_equal( + df3.index_value.to_pandas(), pd.Index([], dtype=np.int64) + ) assert df3.index_value.key != df1.index_value.key assert df3.index_value.key != df2.index_value.key assert df3.shape[1] == 11 # columns is recorded, so we can get it @@ -190,7 +192,9 @@ def test_without_shuffle(func_name, func_opts): assert df3.columns_value.should_be_monotonic is True assert isinstance(df3.index_value.value, IndexValue.Int64Index) assert df3.index_value.should_be_monotonic is True - pd.testing.assert_index_equal(df3.index_value.to_pandas(), pd.Int64Index([])) + pd.testing.assert_index_equal( + df3.index_value.to_pandas(), pd.Index([], dtype=np.int64) + ) assert df3.index_value.key != df1.index_value.key assert df3.index_value.key != df2.index_value.key assert df3.shape[1] == 11 # columns is recorded, so we can get it @@ -408,7 +412,9 @@ def test_dataframe_and_series_with_shuffle(func_name, func_opts): # test df2's index and columns assert df2.shape == (df1.shape[0], np.nan) assert df2.index_value.key == df1.index_value.key - pd.testing.assert_index_equal(df2.columns_value.to_pandas(), pd.Int64Index([])) + pd.testing.assert_index_equal( + df2.columns_value.to_pandas(), pd.Index([], dtype=np.int64) + ) assert df2.columns_value.key != df1.columns_value.key assert df2.columns_value.should_be_monotonic is True @@ -602,7 +608,9 @@ def test_series_and_series_with_shuffle(func_name, func_opts): assert s3.shape == (np.nan,) assert s3.index_value.key != s1.index_value.key assert s3.index_value.key != s2.index_value.key - pd.testing.assert_index_equal(s3.index_value.to_pandas(), pd.Int64Index([])) + pd.testing.assert_index_equal( + s3.index_value.to_pandas(), pd.Index([], dtype=np.int64) + ) assert s3.index_value.should_be_monotonic is True s1, s2, s3 = tile(s1, s2, s3) @@ -726,7 +734,9 @@ def test_with_one_shuffle(func_name, func_opts): assert df3.columns_value.should_be_monotonic is True assert isinstance(df3.index_value.value, IndexValue.Int64Index) assert df3.index_value.should_be_monotonic is True - pd.testing.assert_index_equal(df3.index_value.to_pandas(), pd.Int64Index([])) + pd.testing.assert_index_equal( + df3.index_value.to_pandas(), pd.Index([], dtype=np.int64) + ) assert df3.index_value.key != df1.index_value.key assert df3.index_value.key != df2.index_value.key assert df3.shape[1] == 12 # columns is recorded, so we can get it @@ -858,7 +868,9 @@ def test_with_all_shuffle(func_name, func_opts): assert df3.columns_value.should_be_monotonic is True assert isinstance(df3.index_value.value, IndexValue.Int64Index) assert df3.index_value.should_be_monotonic is True - pd.testing.assert_index_equal(df3.index_value.to_pandas(), pd.Int64Index([])) + pd.testing.assert_index_equal( + df3.index_value.to_pandas(), pd.Index([], dtype=np.int64) + ) assert df3.index_value.key != df1.index_value.key assert df3.index_value.key != df2.index_value.key assert df3.shape[1] == 12 # columns is recorded, so we can get it @@ -958,7 +970,9 @@ def test_with_all_shuffle(func_name, func_opts): assert df6.columns_value.should_be_monotonic is True assert isinstance(df6.index_value.value, IndexValue.Int64Index) assert df6.index_value.should_be_monotonic is True - pd.testing.assert_index_equal(df6.index_value.to_pandas(), pd.Int64Index([])) + pd.testing.assert_index_equal( + df6.index_value.to_pandas(), pd.Index([], dtype=np.int64) + ) assert df6.index_value.key != df4.index_value.key assert df6.index_value.key != df5.index_value.key assert df6.shape[1] == 20 # columns is recorded, so we can get it @@ -1063,7 +1077,9 @@ def test_without_shuffle_and_with_one_chunk(func_name, func_opts): assert df3.columns_value.should_be_monotonic is True assert isinstance(df3.index_value.value, IndexValue.Int64Index) assert df3.index_value.should_be_monotonic is True - pd.testing.assert_index_equal(df3.index_value.to_pandas(), pd.Int64Index([])) + pd.testing.assert_index_equal( + df3.index_value.to_pandas(), pd.Index([], dtype=np.int64) + ) assert df3.index_value.key != df1.index_value.key assert df3.index_value.key != df2.index_value.key assert df3.shape[1] == 12 # columns is recorded, so we can get it @@ -1175,7 +1191,9 @@ def test_both_one_chunk(func_name, func_opts): assert df3.columns_value.should_be_monotonic is True assert isinstance(df3.index_value.value, IndexValue.Int64Index) assert df3.index_value.should_be_monotonic is True - pd.testing.assert_index_equal(df3.index_value.to_pandas(), pd.Int64Index([])) + pd.testing.assert_index_equal( + df3.index_value.to_pandas(), pd.Index([], dtype=np.int64) + ) assert df3.index_value.key != df1.index_value.key assert df3.index_value.key != df2.index_value.key assert df3.shape[1] == 12 # columns is recorded, so we can get it @@ -1219,7 +1237,9 @@ def test_with_shuffle_and_one_chunk(func_name, func_opts): assert df3.columns_value.should_be_monotonic is True assert isinstance(df3.index_value.value, IndexValue.Int64Index) assert df3.index_value.should_be_monotonic is True - pd.testing.assert_index_equal(df3.index_value.to_pandas(), pd.Int64Index([])) + pd.testing.assert_index_equal( + df3.index_value.to_pandas(), pd.Index([], dtype=np.int64) + ) assert df3.index_value.key != df1.index_value.key assert df3.index_value.key != df2.index_value.key assert df3.shape[1] == 12 # columns is recorded, so we can get it @@ -1312,7 +1332,9 @@ def test_on_same_dataframe(func_name, func_opts): assert df2.columns_value.should_be_monotonic is False assert isinstance(df2.index_value.value, IndexValue.Int64Index) assert df2.index_value.should_be_monotonic is False - pd.testing.assert_index_equal(df2.index_value.to_pandas(), pd.Int64Index([])) + pd.testing.assert_index_equal( + df2.index_value.to_pandas(), pd.Index([], dtype=np.int64) + ) assert df2.index_value.key == df.index_value.key assert df2.columns_value.key == df.columns_value.key assert df2.shape[1] == 10 diff --git a/mars/dataframe/core.py b/mars/dataframe/core.py index f12a5cde1b..1bb23c8393 100644 --- a/mars/dataframe/core.py +++ b/mars/dataframe/core.py @@ -138,7 +138,11 @@ def to_pandas(self): kw = {k: v for k, v in kw.items() if v is not None} if kw.get("data") is None: kw["data"] = [] - return getattr(pd, type(self).__name__)(**kw) + + pd_initializer = getattr(self, "_pd_initializer", None) + if pd_initializer is None: + pd_initializer = getattr(pd, type(self).__name__) + return pd_initializer(**kw) class Index(IndexBase): _name = AnyField("name") @@ -240,6 +244,8 @@ def inferred_type(self): return "period" class Int64Index(IndexBase): + _pd_initializer = pd.Index + _name = AnyField("name") _data = NDArrayField("data") _dtype = DataTypeField("dtype") @@ -249,6 +255,8 @@ def inferred_type(self): return "integer" class UInt64Index(IndexBase): + _pd_initializer = pd.Index + _name = AnyField("name") _data = NDArrayField("data") _dtype = DataTypeField("dtype") @@ -258,6 +266,8 @@ def inferred_type(self): return "integer" class Float64Index(IndexBase): + _pd_initializer = pd.Index + _name = AnyField("name") _data = NDArrayField("data") _dtype = DataTypeField("dtype") diff --git a/mars/dataframe/datasource/date_range.py b/mars/dataframe/datasource/date_range.py index eec994a8a4..c0665cf843 100644 --- a/mars/dataframe/datasource/date_range.py +++ b/mars/dataframe/datasource/date_range.py @@ -12,6 +12,7 @@ # See the License for the specific language governing permissions and # limitations under the License. +import warnings from datetime import datetime, date, time import numpy as np @@ -25,6 +26,7 @@ from ...core import OutputType from ...serialization.serializables import AnyField, Int64Field, BoolField, StringField from ...tensor.utils import decide_chunk_sizes +from ...utils import pd_release_version, NoDefault from ..operands import DataFrameOperand, DataFrameOperandMixin from ..utils import parse_index @@ -46,116 +48,70 @@ def normalize_date(dt): # from pandas/_libs/tslibs/conversion.pyx raise TypeError(f"Unrecognized type: {type(dt)}") +_date_range_use_inclusive = pd_release_version[:2] >= (1, 4) + + class DataFrameDateRange(DataFrameOperand, DataFrameOperandMixin): _op_type_ = OperandDef.DATE_RANGE - _start = AnyField("start") - _end = AnyField("end") - _periods = Int64Field("periods") - _freq = AnyField("freq") - _tz = AnyField("tz") - _normalize = BoolField("normalize") - _name = StringField("name") - _closed = StringField("closed") + start = AnyField("start") + end = AnyField("end") + periods = Int64Field("periods") + freq = AnyField("freq") + tz = AnyField("tz") + normalize = BoolField("normalize") + name = StringField("name") + inclusive = StringField("inclusive") def __init__( self, - start=None, - end=None, - periods=None, - freq=None, - tz=None, - normalize=None, - name=None, - closed=None, output_types=None, **kw, ): - super().__init__( - _start=start, - _end=end, - _periods=periods, - _freq=freq, - _tz=tz, - _normalize=normalize, - _name=name, - _closed=closed, - _output_types=output_types, - **kw, - ) + super().__init__(_output_types=output_types, **kw) if self.output_types is None: self.output_types = [OutputType.index] - - @property - def start(self): - return self._start - - @property - def end(self): - return self._end - - @property - def periods(self): - return self._periods - - @property - def freq(self): - return self._freq - - @property - def tz(self): - return self._tz - - @property - def normalize(self): - return self._normalize - - @property - def name(self): - return self._name - - @property - def closed(self): - return self._closed + if getattr(self, "inclusive", None) is None: + self.inclusive = "both" def __call__(self, shape, chunk_size=None): - dtype = pd.Index([self._start]).dtype + dtype = pd.Index([self.start]).dtype index_value = parse_index( - pd.Index([], dtype=dtype), self._start, self._end, self._periods, self._tz + pd.Index([], dtype=dtype), self.start, self.end, self.periods, self.tz ) # gen index value info - index_value.value._min_val = self._start + index_value.value._min_val = self.start index_value.value._min_val_close = True - index_value.value._max_val = self._end + index_value.value._max_val = self.end index_value.value._max_val_close = True index_value.value._is_unique = True index_value.value._is_monotonic_increasing = True - index_value.value._freq = self._freq + index_value.value._freq = self.freq return self.new_index( None, shape=shape, dtype=dtype, index_value=index_value, - name=self._name, + name=self.name, raw_chunk_size=chunk_size, - freq=self._freq, + freq=self.freq, ) @classmethod - def tile(cls, op): + def tile(cls, op: "DataFrameDateRange"): out = op.outputs[0] start = op.start end = op.end freq = op.freq periods = op.periods - closed = op.closed + inclusive = op.inclusive chunk_length = out.extra_params.raw_chunk_size or options.chunk_size chunk_length = decide_chunk_sizes(out.shape, chunk_length, out.dtype.itemsize)[ 0 ] - if closed == "right": + if inclusive in ("neither", "right"): # if left not close, add one more for the first chunk chunk_length = (chunk_length[0] + 1,) + chunk_length[1:] @@ -169,15 +125,20 @@ def tile(cls, op): cum_nsplit = [0] + np.cumsum(chunk_length).tolist() for i, chunk_size in enumerate(chunk_length): chunk_op = op.copy().reset_key() - chunk_op._periods = chunk_size - if closed != "right" or i > 0: - chunk_op._closed = None + chunk_op.periods = chunk_size + + if i > 0 or inclusive not in ("neither", "right"): + # for chunks in the middle, all sides are inclusive + chunk_op.inclusive = "both" + elif 0 == i and inclusive == "neither": + chunk_op.inclusive = "right" + chunk_i_start = cum_nsplit[i] if chunk_i_start > 0: - chunk_start = chunk_op._start = start + freq * chunk_i_start + chunk_start = chunk_op.start = start + freq * chunk_i_start else: - chunk_start = chunk_op._start = start - chunk_end = chunk_op._end = chunk_start + (chunk_size - 1) * freq + chunk_start = chunk_op.start = start + chunk_end = chunk_op.end = chunk_start + (chunk_size - 1) * freq # gen chunk index_value chunk_index_value = parse_index(out.index_value.to_pandas(), i, out) @@ -188,7 +149,11 @@ def tile(cls, op): chunk_index_value.value._is_unique = True chunk_index_value.value._is_monotonic_increasing = True - size = chunk_size - 1 if i == 0 and closed == "right" else chunk_size + size = ( + chunk_size - 1 + if i == 0 and inclusive in ("neither", "right") + else chunk_size + ) out_chunk = chunk_op.new_chunk( None, shape=(size,), @@ -206,12 +171,12 @@ def tile(cls, op): return new_op.new_indexes(None, kws=[params]) @classmethod - def execute(cls, ctx, op): + def execute(cls, ctx, op: "DataFrameDateRange"): start, end, periods = op.start, op.end, op.periods freq = op.freq if freq is not None: end = None - ctx[op.outputs[0].key] = pd.date_range( + kw = dict( start=start, end=end, periods=periods, @@ -219,8 +184,13 @@ def execute(cls, ctx, op): tz=op.tz, normalize=op.normalize, name=op.name, - closed=op.closed, + inclusive=op.inclusive, ) + if not _date_range_use_inclusive: + closed = kw.pop("inclusive") + assert closed != "neither" + kw["closed"] = None if closed == "both" else closed + ctx[op.outputs[0].key] = pd.date_range(**kw) _midnight = time(0, 0) @@ -330,7 +300,8 @@ def date_range( tz=None, normalize=False, name=None, - closed=None, + closed=NoDefault.no_default, + inclusive=None, chunk_size=None, **kwargs, ): @@ -357,9 +328,8 @@ def date_range( Normalize start/end dates to midnight before generating date range. name : str, default None Name of the resulting DatetimeIndex. - closed : {None, 'left', 'right'}, optional - Make the interval closed with respect to the given frequency to - the 'left', 'right', or both sides (None, the default). + inclusive : {“both”, “neither”, “left”, “right”}, default “both” + Include boundaries; Whether to set each bound as closed or open. **kwargs For compatibility. Has no effect on the result. @@ -452,24 +422,29 @@ def date_range( '2018-01-05 00:00:00+09:00'], dtype='datetime64[ns, Asia/Tokyo]', freq='D') - `closed` controls whether to include `start` and `end` that are on the - boundary. The default includes boundary points on either end. + `inclusive` controls whether to include `start` and `end` that are on the + boundary. The default, "both", includes boundary points on either end. - >>> md.date_range(start='2017-01-01', end='2017-01-04', closed=None).execute() + >>> md.date_range(start='2017-01-01', end='2017-01-04', inclusive='both').execute() DatetimeIndex(['2017-01-01', '2017-01-02', '2017-01-03', '2017-01-04'], dtype='datetime64[ns]', freq='D') - Use ``closed='left'`` to exclude `end` if it falls on the boundary. + Use ``inclusive='left'`` to exclude `end` if it falls on the boundary. >>> md.date_range(start='2017-01-01', end='2017-01-04', closed='left').execute() DatetimeIndex(['2017-01-01', '2017-01-02', '2017-01-03'], dtype='datetime64[ns]', freq='D') - Use ``closed='right'`` to exclude `start` if it falls on the boundary. + Use ``inclusive='right'`` to exclude `start` if it falls on the boundary, + and similarly inclusive='neither' will exclude both `start` and `end`. >>> md.date_range(start='2017-01-01', end='2017-01-04', closed='right').execute() DatetimeIndex(['2017-01-02', '2017-01-03', '2017-01-04'], dtype='datetime64[ns]', freq='D') + + .. note:: + Pandas 1.4.0 or later is required to use ``inclusive='neither'``. + Otherwise an error may be raised. """ # validate periods if isinstance(periods, (float, np.floating)): @@ -486,6 +461,16 @@ def date_range( ) freq = to_offset(freq) + if _date_range_use_inclusive and closed is not NoDefault.no_default: + warnings.warn( + "Argument `closed` is deprecated in favor of `inclusive`.", FutureWarning + ) + elif closed is NoDefault.no_default: + closed = None + + if inclusive is None and closed is not NoDefault.no_default: + inclusive = closed + if start is not None: start = pd.Timestamp(start) @@ -502,33 +487,37 @@ def date_range( # start is None and end is not None # adjust end first end = pd.date_range(end=end, periods=1, freq=freq)[0] + if inclusive == "neither": + end -= freq size = periods start = end - (periods - 1) * freq - if closed == "left": + if inclusive in ("neither", "left"): size -= 1 - elif closed == "right": + elif inclusive == "right": # when start is None, closed == 'left' would not take effect # thus just ignore - closed = None + inclusive = "both" elif end is None: # end is None # adjust start first start = pd.date_range(start=start, periods=1, freq=freq)[0] size = periods end = start + (periods - 1) * freq - if closed == "right": + if inclusive in ("neither", "right"): size -= 1 - elif closed == "left": + elif inclusive == "left": # when end is None, closed == 'left' would not take effect # thus just ignore - closed = None + inclusive = "both" else: if periods is None: periods = size = int((end - start) / freq + 1) else: size = periods - if closed is not None: + if inclusive in ("left", "right"): size -= 1 + elif inclusive == "neither": + size -= 2 shape = (size,) op = DataFrameDateRange( @@ -538,8 +527,8 @@ def date_range( freq=freq, tz=tz, normalize=normalize, - closed=closed, name=name, + inclusive=inclusive, **kwargs, ) return op(shape, chunk_size=chunk_size) diff --git a/mars/dataframe/datasource/tests/test_datasource_execution.py b/mars/dataframe/datasource/tests/test_datasource_execution.py index a299aeeac2..9c2238b2ce 100644 --- a/mars/dataframe/datasource/tests/test_datasource_execution.py +++ b/mars/dataframe/datasource/tests/test_datasource_execution.py @@ -12,6 +12,7 @@ # See the License for the specific language governing permissions and # limitations under the License. +import itertools import os import tempfile import time @@ -41,7 +42,7 @@ from .... import dataframe as md from ....config import option_context from ....tests.core import require_cudf, require_ray -from ....utils import arrow_array_to_objects, lazy_import +from ....utils import arrow_array_to_objects, lazy_import, pd_release_version from ..dataframe import from_pandas as from_pandas_df from ..series import from_pandas as from_pandas_series from ..index import from_pandas as from_pandas_index, from_tileable @@ -50,6 +51,7 @@ ray = lazy_import("ray") +_date_range_use_inclusive = pd_release_version[:2] >= (1, 4) def test_from_pandas_dataframe_execution(setup): @@ -908,45 +910,63 @@ def test_read_sql_use_arrow_dtype(setup): ) +@pytest.mark.pd_compat def test_date_range_execution(setup): - for closed in [None, "left", "right"]: + chunk_sizes = [None, 3] + inclusives = ["both", "neither", "left", "right"] + + if _date_range_use_inclusive: + with pytest.warns(FutureWarning, match="closed"): + md.date_range("2020-1-1", periods=10, closed="right") + + for chunk_size, inclusive in itertools.product(chunk_sizes, inclusives): + kw = dict() + if _date_range_use_inclusive: + kw["inclusive"] = inclusive + else: + if inclusive == "neither": + continue + elif inclusive == "both": + inclusive = None + kw["closed"] = inclusive + # start, periods, freq - dr = md.date_range("2020-1-1", periods=10, chunk_size=3, closed=closed) + dr = md.date_range("2020-1-1", periods=10, chunk_size=chunk_size, **kw) result = dr.execute().fetch() - expected = pd.date_range("2020-1-1", periods=10, closed=closed) + expected = pd.date_range("2020-1-1", periods=10, **kw) pd.testing.assert_index_equal(result, expected) # end, periods, freq - dr = md.date_range(end="2020-1-10", periods=10, chunk_size=3, closed=closed) + dr = md.date_range(end="2020-1-10", periods=10, chunk_size=chunk_size, **kw) result = dr.execute().fetch() - expected = pd.date_range(end="2020-1-10", periods=10, closed=closed) + expected = pd.date_range(end="2020-1-10", periods=10, **kw) pd.testing.assert_index_equal(result, expected) # start, end, freq - dr = md.date_range("2020-1-1", "2020-1-10", chunk_size=3, closed=closed) + dr = md.date_range("2020-1-1", "2020-1-10", chunk_size=chunk_size, **kw) result = dr.execute().fetch() - expected = pd.date_range("2020-1-1", "2020-1-10", closed=closed) + expected = pd.date_range("2020-1-1", "2020-1-10", **kw) pd.testing.assert_index_equal(result, expected) # start, end and periods dr = md.date_range( - "2020-1-1", "2020-1-10", periods=19, chunk_size=3, closed=closed + "2020-1-1", "2020-1-10", periods=19, chunk_size=chunk_size, **kw ) result = dr.execute().fetch() - expected = pd.date_range("2020-1-1", "2020-1-10", periods=19, closed=closed) + expected = pd.date_range("2020-1-1", "2020-1-10", periods=19, **kw) pd.testing.assert_index_equal(result, expected) # start, end and freq dr = md.date_range( - "2020-1-1", "2020-1-10", freq="12H", chunk_size=3, closed=closed + "2020-1-1", "2020-1-10", freq="12H", chunk_size=chunk_size, **kw ) result = dr.execute().fetch() - expected = pd.date_range("2020-1-1", "2020-1-10", freq="12H", closed=closed) + expected = pd.date_range("2020-1-1", "2020-1-10", freq="12H", **kw) pd.testing.assert_index_equal(result, expected) # test timezone diff --git a/mars/dataframe/indexing/set_index.py b/mars/dataframe/indexing/set_index.py index 82bceec937..f8d4190fb7 100644 --- a/mars/dataframe/indexing/set_index.py +++ b/mars/dataframe/indexing/set_index.py @@ -116,7 +116,7 @@ def _tile_column_axis_n_chunk(cls, op, in_df, out_df, out_chunks): shape=new_shape, dtypes=dtypes, index=input_chunk.index, - index_value=parse_index(pd.Int64Index([])), + index_value=parse_index(pd.Index([], dtype=np.int64)), columns_value=columns, ) out_chunks.append(out_chunk) diff --git a/mars/dataframe/sort/psrs.py b/mars/dataframe/sort/psrs.py index 41cc17d692..5defef7f24 100644 --- a/mars/dataframe/sort/psrs.py +++ b/mars/dataframe/sort/psrs.py @@ -183,7 +183,7 @@ def partition_local_data( **cls._collect_op_properties(op) ) if isinstance(chunk_inputs[0].index_value.value, IndexValue.RangeIndex): - index_value = parse_index(pd.Int64Index([])) + index_value = parse_index(pd.Index([], dtype=np.int64)) else: index_value = chunk_inputs[0].index_value kw = dict( diff --git a/mars/dataframe/sort/sort_values.py b/mars/dataframe/sort/sort_values.py index b96f83758e..3bf94ed893 100644 --- a/mars/dataframe/sort/sort_values.py +++ b/mars/dataframe/sort/sort_values.py @@ -12,6 +12,7 @@ # See the License for the specific language governing permissions and # limitations under the License. +import numpy as np import pandas as pd from ... import opcodes as OperandDef @@ -117,7 +118,7 @@ def __call__(self, a): index_value = parse_index(pd.RangeIndex(a.shape[0])) else: if isinstance(a.index_value.value, IndexValue.RangeIndex): - index_value = parse_index(pd.Int64Index([])) + index_value = parse_index(pd.Index([], dtype=np.int64)) else: index_value = a.index_value if a.ndim == 2: diff --git a/mars/dataframe/tests/test_utils.py b/mars/dataframe/tests/test_utils.py index 2740d46a64..27109fc3bf 100644 --- a/mars/dataframe/tests/test_utils.py +++ b/mars/dataframe/tests/test_utils.py @@ -109,12 +109,12 @@ def test_decide_series_chunks(): def test_parse_index(): - index = pd.Int64Index([]) + index = pd.Index([], dtype=np.int64) parsed_index = parse_index(index) assert isinstance(parsed_index.value, IndexValue.Int64Index) pd.testing.assert_index_equal(index, parsed_index.to_pandas()) - index = pd.Int64Index([1, 2]) + index = pd.Index([1, 2], dtype=np.int64) parsed_index = parse_index(index) # not parse data assert isinstance(parsed_index.value, IndexValue.Int64Index) with pytest.raises(AssertionError): @@ -325,7 +325,7 @@ def test_filter_index_value(): == pd_index[(pd_index > 2) & (pd_index < 10)].tolist() ) - pd_index = pd.Int64Index([0, 3, 8]) + pd_index = pd.Index([0, 3, 8], dtype=np.int64) index_value = parse_index(pd_index, store_data=True) min_max = (2, True, 8, False) @@ -397,8 +397,8 @@ def test_infer_index_value(): assert oival.key != ival2.key # same int64 index, all unique - index1 = pd.Int64Index([1, 2]) - index2 = pd.Int64Index([1, 2]) + index1 = pd.Index([1, 2], dtype=np.int64) + index2 = pd.Index([1, 2], dtype=np.int64) ival1 = parse_index(index1) ival2 = parse_index(index2) @@ -409,8 +409,8 @@ def test_infer_index_value(): assert oival.key == ival2.key # same int64 index, not all unique - index1 = pd.Int64Index([1, 2, 2]) - index2 = pd.Int64Index([1, 2, 2]) + index1 = pd.Index([1, 2, 2], dtype=np.int64) + index2 = pd.Index([1, 2, 2], dtype=np.int64) ival1 = parse_index(index1) ival2 = parse_index(index2) @@ -421,8 +421,8 @@ def test_infer_index_value(): assert oival.key != ival2.key # different int64 index - index1 = pd.Int64Index([1, 2]) - index2 = pd.Int64Index([2, 3]) + index1 = pd.Index([1, 2], dtype=np.int64) + index2 = pd.Index([2, 3], dtype=np.int64) ival1 = parse_index(index1) ival2 = parse_index(index2) @@ -433,8 +433,8 @@ def test_infer_index_value(): assert oival.key != ival2.key # different index type - index1 = pd.Int64Index([1, 2]) - index2 = pd.Float64Index([2.0, 3.0]) + index1 = pd.Index([1, 2], dtype=np.int64) + index2 = pd.Index([2.0, 3.0], dtype=np.float64) ival1 = parse_index(index1) ival2 = parse_index(index2) @@ -446,7 +446,7 @@ def test_infer_index_value(): # range index and other index index1 = pd.RangeIndex(1, 4) - index2 = pd.Float64Index([2, 3, 4]) + index2 = pd.Index([2, 3, 4], dtype=np.float64) ival1 = parse_index(index1) ival2 = parse_index(index2) diff --git a/mars/dataframe/utils.py b/mars/dataframe/utils.py index 883f088a70..5eea2e4c29 100644 --- a/mars/dataframe/utils.py +++ b/mars/dataframe/utils.py @@ -771,7 +771,9 @@ def infer_index_value(left_index_value, right_index_value): ): if left_index_value.value.slice == right_index_value.value.slice: return left_index_value - return parse_index(pd.Int64Index([]), left_index_value, right_index_value) + return parse_index( + pd.Index([], dtype=np.int64), left_index_value, right_index_value + ) # when left index and right index is identical, and both of them are elements unique, # we can infer that the out index should be identical also diff --git a/mars/learn/ensemble/_bagging.py b/mars/learn/ensemble/_bagging.py index f89d404b29..1e5483809f 100644 --- a/mars/learn/ensemble/_bagging.py +++ b/mars/learn/ensemble/_bagging.py @@ -659,7 +659,7 @@ def __call__(self, data: TileableType, feature_indices: TileableType = None): inputs.append(feature_indices) params["shape"] = (data.shape[0], np.nan) if isinstance(data, DATAFRAME_TYPE): - params["index_value"] = parse_index(pd.Int64Index([]), data.key) + params["index_value"] = parse_index(pd.Index([], dtype=np.int64), data.key) return self.new_tileable(inputs, **params) @classmethod @@ -707,7 +707,7 @@ def _gen_combine_chunk(chunks): params["index"] = (chunks[0].index[0], chunks[0].index[2]) if isinstance(t_data, DATAFRAME_TYPE): params["index_value"] = parse_index( - pd.Int64Index([]), chunks[0].key + pd.Index([], dtype=np.int64), chunks[0].key ) inputs = chunks.tolist() return new_op.new_chunk(inputs, **params) diff --git a/mars/services/scheduling/worker/tests/test_execution.py b/mars/services/scheduling/worker/tests/test_execution.py index e50a3714da..cce12d3443 100644 --- a/mars/services/scheduling/worker/tests/test_execution.py +++ b/mars/services/scheduling/worker/tests/test_execution.py @@ -397,7 +397,7 @@ def test_estimate_size(): from .....dataframe.fetch import DataFrameFetch from .....dataframe.utils import parse_index - index_value = parse_index(pd.Int64Index([10, 20, 30])) + index_value = parse_index(pd.Index([10, 20, 30], dtype=np.int64)) input1 = DataFrameFetch(output_types=[OutputType.series],).new_chunk( [], _key="INPUT1", shape=(np.nan,), dtype=np.dtype("O"), index_value=index_value diff --git a/mars/tensor/statistics/bincount.py b/mars/tensor/statistics/bincount.py index c42f37985c..a6416d100c 100644 --- a/mars/tensor/statistics/bincount.py +++ b/mars/tensor/statistics/bincount.py @@ -118,7 +118,7 @@ def tile(cls, op: "TensorBinCount"): dtype=out.dtype, shape=(np.nan,), index=a_chunk.index, - index_value=parse_index(pd.Int64Index([0]), a_chunk.key), + index_value=parse_index(pd.Index([0], dtype=np.int64), a_chunk.key), ) )