diff --git a/python/bootstrap.py b/python/bootstrap.py index 4fb2b0e91..85ce45438 100644 --- a/python/bootstrap.py +++ b/python/bootstrap.py @@ -66,6 +66,9 @@ def generate_nanoarrow_pxd(self, file_in, file_out): output.write(b" ctypedef int ArrowErrorCode\n") output.write(b" cdef int NANOARROW_OK\n") output.write(b" cdef int NANOARROW_MAX_FIXED_BUFFERS\n") + output.write(b" cdef int ARROW_FLAG_DICTIONARY_ORDERED\n") + output.write(b" cdef int ARROW_FLAG_NULLABLE\n") + output.write(b" cdef int ARROW_FLAG_MAP_KEYS_SORTED\n") output.write(b"\n") for type in types_cython: diff --git a/python/src/nanoarrow/__init__.py b/python/src/nanoarrow/__init__.py index 318493f8a..ae8f8204c 100644 --- a/python/src/nanoarrow/__init__.py +++ b/python/src/nanoarrow/__init__.py @@ -26,4 +26,40 @@ allocate_c_array, allocate_c_array_stream, ) +from nanoarrow.schema import ( # noqa: F401 + Schema, + Type, + TimeUnit, + schema, + null, + bool, + int8, + uint8, + int16, + uint16, + int32, + uint32, + int64, + uint64, + float16, + float32, + float64, + string, + large_string, + binary, + large_binary, + fixed_size_binary, + date32, + date64, + time32, + time64, + timestamp, + duration, + interval_months, + interval_day_time, + interval_month_day_nano, + decimal128, + decimal256, + struct, +) from nanoarrow._version import __version__ # noqa: F401 diff --git a/python/src/nanoarrow/_lib.pyx b/python/src/nanoarrow/_lib.pyx index 54d7bbdc0..67e3f1232 100644 --- a/python/src/nanoarrow/_lib.pyx +++ b/python/src/nanoarrow/_lib.pyx @@ -221,6 +221,69 @@ cdef class Error: raise NanoarrowException(what, code, "") +# This could in theory use cpdef enum, but an initial attempt to do so +# resulted Cython duplicating some function definitions. For now, we resort +# to a more manual trampoline of values to make them accessible from +# schema.py. +cdef class CArrowType: + """ + Wrapper around ArrowType to provide implementations in Python access + to the values. 
+ """ + + UNINITIALIZED = NANOARROW_TYPE_UNINITIALIZED + NA = NANOARROW_TYPE_NA + BOOL = NANOARROW_TYPE_BOOL + UINT8 = NANOARROW_TYPE_UINT8 + INT8 = NANOARROW_TYPE_INT8 + UINT16 = NANOARROW_TYPE_UINT16 + INT16 = NANOARROW_TYPE_INT16 + UINT32 = NANOARROW_TYPE_UINT32 + INT32 = NANOARROW_TYPE_INT32 + UINT64 = NANOARROW_TYPE_UINT64 + INT64 = NANOARROW_TYPE_INT64 + HALF_FLOAT = NANOARROW_TYPE_HALF_FLOAT + FLOAT = NANOARROW_TYPE_FLOAT + DOUBLE = NANOARROW_TYPE_DOUBLE + STRING = NANOARROW_TYPE_STRING + BINARY = NANOARROW_TYPE_BINARY + FIXED_SIZE_BINARY = NANOARROW_TYPE_FIXED_SIZE_BINARY + DATE32 = NANOARROW_TYPE_DATE32 + DATE64 = NANOARROW_TYPE_DATE64 + TIMESTAMP = NANOARROW_TYPE_TIMESTAMP + TIME32 = NANOARROW_TYPE_TIME32 + TIME64 = NANOARROW_TYPE_TIME64 + INTERVAL_MONTHS = NANOARROW_TYPE_INTERVAL_MONTHS + INTERVAL_DAY_TIME = NANOARROW_TYPE_INTERVAL_DAY_TIME + DECIMAL128 = NANOARROW_TYPE_DECIMAL128 + DECIMAL256 = NANOARROW_TYPE_DECIMAL256 + LIST = NANOARROW_TYPE_LIST + STRUCT = NANOARROW_TYPE_STRUCT + SPARSE_UNION = NANOARROW_TYPE_SPARSE_UNION + DENSE_UNION = NANOARROW_TYPE_DENSE_UNION + DICTIONARY = NANOARROW_TYPE_DICTIONARY + MAP = NANOARROW_TYPE_MAP + EXTENSION = NANOARROW_TYPE_EXTENSION + FIXED_SIZE_LIST = NANOARROW_TYPE_FIXED_SIZE_LIST + DURATION = NANOARROW_TYPE_DURATION + LARGE_STRING = NANOARROW_TYPE_LARGE_STRING + LARGE_BINARY = NANOARROW_TYPE_LARGE_BINARY + LARGE_LIST = NANOARROW_TYPE_LARGE_LIST + INTERVAL_MONTH_DAY_NANO = NANOARROW_TYPE_INTERVAL_MONTH_DAY_NANO + + +cdef class CArrowTimeUnit: + """ + Wrapper around ArrowTimeUnit to provide implementations in Python access + to the values. 
+ """ + + SECOND = NANOARROW_TIME_UNIT_SECOND + MILLI = NANOARROW_TIME_UNIT_MILLI + MICRO = NANOARROW_TIME_UNIT_MICRO + NANO = NANOARROW_TIME_UNIT_NANO + + cdef class CSchema: """Low-level ArrowSchema wrapper @@ -243,9 +306,16 @@ cdef class CSchema: return CSchema(base, (c_schema_out)) def __cinit__(self, object base, uintptr_t addr): - self._base = base, + self._base = base self._ptr = addr + def __deepcopy__(self): + cdef CSchema out = CSchema.allocate() + cdef int result = ArrowSchemaDeepCopy(self._ptr, out._ptr) + if result != NANOARROW_OK: + raise NanoarrowException("ArrowSchemaDeepCopy()", result) + return out + @staticmethod def _import_from_c_capsule(schema_capsule): """ @@ -278,6 +348,23 @@ cdef class CSchema: Error.raise_error("ArrowSchemaDeepCopy", result) return schema_capsule + @property + def _capsule(self): + """ + Returns the capsule backing this CSchema or None if it does not exist + or points to a parent ArrowSchema. + """ + cdef ArrowSchema* maybe_capsule_ptr + maybe_capsule_ptr = PyCapsule_GetPointer(self._base, 'arrow_schema') + + # This will return False if this is a child CSchema whose capsule holds + # the parent ArrowSchema + if maybe_capsule_ptr == self._ptr: + return self._base + + return None + + def _addr(self): return self._ptr @@ -376,6 +463,10 @@ cdef class CSchemaView: # lifetime guarantees that the pointed-to data from ArrowStringViews remains valid cdef object _base cdef ArrowSchemaView _schema_view + # Not part of the ArrowSchemaView (but possibly should be) + cdef bint _dictionary_ordered + cdef bint _nullable + cdef bint _map_keys_sorted _fixed_size_types = ( NANOARROW_TYPE_FIXED_SIZE_LIST, @@ -409,6 +500,18 @@ cdef class CSchemaView: if result != NANOARROW_OK: error.raise_message("ArrowSchemaViewInit()", result) + self._dictionary_ordered = schema._ptr.flags & ARROW_FLAG_DICTIONARY_ORDERED + self._nullable = schema._ptr.flags & ARROW_FLAG_NULLABLE + self._map_keys_sorted = schema._ptr.flags & ARROW_FLAG_MAP_KEYS_SORTED + + 
@property + def type_id(self): + return self._schema_view.type + + @property + def storage_type_id(self): + return self._schema_view.storage_type + @property def type(self): cdef const char* type_str = ArrowTypeString(self._schema_view.type) @@ -421,6 +524,18 @@ cdef class CSchemaView: if type_str != NULL: return type_str.decode('UTF-8') + @property + def dictionary_ordered(self): + return self._dictionary_ordered != 0 + + @property + def nullable(self): + return self._nullable != 0 + + @property + def map_keys_sorted(self): + return self._map_keys_sorted != 0 + @property def fixed_size(self): if self._schema_view.type in CSchemaView._fixed_size_types: @@ -441,6 +556,11 @@ cdef class CSchemaView: if self._schema_view.type in CSchemaView._decimal_types: return self._schema_view.decimal_scale + @property + def time_unit_id(self): + if self._schema_view.type in CSchemaView._time_unit_types: + return self._schema_view.time_unit + @property def time_unit(self): if self._schema_view.type in CSchemaView._time_unit_types: @@ -478,6 +598,130 @@ cdef class CSchemaView: def __repr__(self): return _lib_utils.schema_view_repr(self) + +cdef class CSchemaBuilder: + cdef CSchema c_schema + cdef ArrowSchema* _ptr + + def __cinit__(self, CSchema schema): + self.c_schema = schema + self._ptr = schema._ptr + if self._ptr.release == NULL: + ArrowSchemaInit(self._ptr) + + @staticmethod + def allocate(): + return CSchemaBuilder(CSchema.allocate()) + + def child(self, int64_t i): + return CSchemaBuilder(self.c_schema.child(i)) + + def set_type(self, int type_id): + self.c_schema._assert_valid() + + cdef int result = ArrowSchemaSetType(self._ptr, type_id) + if result != NANOARROW_OK: + Error.raise_error("ArrowSchemaSetType()", result) + + return self + + def set_type_decimal(self, int type_id, int precision, int scale): + self.c_schema._assert_valid() + + cdef int result = ArrowSchemaSetTypeDecimal(self._ptr, type_id, precision, scale) + if result != NANOARROW_OK: + 
Error.raise_error("ArrowSchemaSetType()", result) + + def set_type_fixed_size(self, int type_id, int fixed_size): + self.c_schema._assert_valid() + + cdef int result = ArrowSchemaSetTypeFixedSize(self._ptr, type_id, fixed_size) + if result != NANOARROW_OK: + Error.raise_error("ArrowSchemaSetTypeFixedSize()", result) + + return self + + def set_type_date_time(self, int type_id, int time_unit, timezone): + self.c_schema._assert_valid() + + cdef int result + if timezone is None: + result = ArrowSchemaSetTypeDateTime(self._ptr, type_id, time_unit, NULL) + else: + timezone = str(timezone) + result = ArrowSchemaSetTypeDateTime(self._ptr, type_id, time_unit, timezone.encode("UTF-8")) + + if result != NANOARROW_OK: + Error.raise_error("ArrowSchemaSetTypeDateTime()", result) + + return self + + def set_format(self, str format): + self.c_schema._assert_valid() + + cdef int result = ArrowSchemaSetFormat(self._ptr, format.encode("UTF-8")) + if result != NANOARROW_OK: + Error.raise_error("ArrowSchemaSetFormat()", result) + + return self + + def set_name(self, name): + self.c_schema._assert_valid() + + cdef int result + if name is None: + result = ArrowSchemaSetName(self._ptr, NULL) + else: + name = str(name) + result = ArrowSchemaSetName(self._ptr, name.encode("UTF-8")) + + if result != NANOARROW_OK: + Error.raise_error("ArrowSchemaSetName()", result) + + return self + + def allocate_children(self, int n): + self.c_schema._assert_valid() + + cdef int result = ArrowSchemaAllocateChildren(self._ptr, n) + if result != NANOARROW_OK: + Error.raise_error("ArrowSchemaAllocateChildren()", result) + + return self + + def set_child(self, int64_t i, name, CSchema child_src): + self.c_schema._assert_valid() + + if i < 0 or i >= self._ptr.n_children: + raise IndexError(f"Index out of range: {i}") + + if self._ptr.children[i].release != NULL: + ArrowSchemaRelease(self._ptr.children[i]) + + cdef int result = ArrowSchemaDeepCopy(child_src._ptr, self._ptr.children[i]) + if result != 
NANOARROW_OK: + Error.raise_error("", result) + + if name is not None: + name = str(name) + result = ArrowSchemaSetName(self._ptr.children[i], name.encode("UTF-8")) + + return self + + def set_nullable(self, nullable): + if nullable: + self._ptr.flags = self._ptr.flags | ARROW_FLAG_NULLABLE + else: + self._ptr.flags = self._ptr.flags & ~ARROW_FLAG_NULLABLE + + return self + + def finish(self): + self.c_schema._assert_valid() + + return self.c_schema + + cdef class CArray: """Low-level ArrowArray wrapper diff --git a/python/src/nanoarrow/schema.py b/python/src/nanoarrow/schema.py new file mode 100644 index 000000000..ca1ebccff --- /dev/null +++ b/python/src/nanoarrow/schema.py @@ -0,0 +1,1023 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +import enum +import reprlib +from typing import Union + +from nanoarrow._lib import CArrowTimeUnit, CArrowType, CSchemaBuilder, CSchemaView +from nanoarrow.c_lib import c_schema + + +class Type(enum.Enum): + """The Type enumerator provides a means by which the various type + categories can be identified. Type values can be used in place of + :class:`Schema` instances in most places for parameter-free types. 
+ """ + + UNINITIALIZED = CArrowType.UNINITIALIZED + NULL = CArrowType.NA + BOOL = CArrowType.BOOL + UINT8 = CArrowType.UINT8 + INT8 = CArrowType.INT8 + UINT16 = CArrowType.UINT16 + INT16 = CArrowType.INT16 + UINT32 = CArrowType.UINT32 + INT32 = CArrowType.INT32 + UINT64 = CArrowType.UINT64 + INT64 = CArrowType.INT64 + HALF_FLOAT = CArrowType.HALF_FLOAT + FLOAT = CArrowType.FLOAT + DOUBLE = CArrowType.DOUBLE + STRING = CArrowType.STRING + BINARY = CArrowType.BINARY + FIXED_SIZE_BINARY = CArrowType.FIXED_SIZE_BINARY + DATE32 = CArrowType.DATE32 + DATE64 = CArrowType.DATE64 + TIMESTAMP = CArrowType.TIMESTAMP + TIME32 = CArrowType.TIME32 + TIME64 = CArrowType.TIME64 + INTERVAL_MONTHS = CArrowType.INTERVAL_MONTHS + INTERVAL_DAY_TIME = CArrowType.INTERVAL_DAY_TIME + DECIMAL128 = CArrowType.DECIMAL128 + DECIMAL256 = CArrowType.DECIMAL256 + LIST = CArrowType.LIST + STRUCT = CArrowType.STRUCT + SPARSE_UNION = CArrowType.SPARSE_UNION + DENSE_UNION = CArrowType.DENSE_UNION + DICTIONARY = CArrowType.DICTIONARY + MAP = CArrowType.MAP + EXTENSION = CArrowType.EXTENSION + FIXED_SIZE_LIST = CArrowType.FIXED_SIZE_LIST + DURATION = CArrowType.DURATION + LARGE_STRING = CArrowType.LARGE_STRING + LARGE_BINARY = CArrowType.LARGE_BINARY + LARGE_LIST = CArrowType.LARGE_LIST + INTERVAL_MONTH_DAY_NANO = CArrowType.INTERVAL_MONTH_DAY_NANO + + def __arrow_c_schema__(self): + # This will only work for parameter-free types + c_schema = CSchemaBuilder.allocate().set_type(self.value).set_name("").finish() + return c_schema._capsule + + +class TimeUnit(enum.Enum): + """Unit enumerator for timestamp, duration, and time types.""" + + SECOND = CArrowTimeUnit.SECOND + MILLI = CArrowTimeUnit.MILLI + MICRO = CArrowTimeUnit.MICRO + NANO = CArrowTimeUnit.NANO + + @staticmethod + def create(obj): + """Create a TimeUnit from parameter input. + + This constructor will accept the abbreviations "s", "ms", "us", and "ns" + and return the appropriate enumerator value. 
+ + >>> import nanoarrow as na + >>> na.TimeUnit.create("s") + <TimeUnit.SECOND: 0> + """ + + if isinstance(obj, str): + if obj == "s": + return TimeUnit.SECOND + elif obj == "ms": + return TimeUnit.MILLI + elif obj == "us": + return TimeUnit.MICRO + elif obj == "ns": + return TimeUnit.NANO + + return TimeUnit(obj) + + +class Schema: + """The Schema is nanoarrow's high-level data type representation whose scope maps to + that of the ArrowSchema in the Arrow C Data interface. See :func:`schema` for class + details. + """ + + def __init__( + self, + obj, + *, + name=None, + nullable=None, + **params, + ) -> None: + if isinstance(obj, Type): + self._c_schema = _c_schema_from_type_and_params(obj, params, name, nullable) + elif not params and nullable is None and name is None: + self._c_schema = c_schema(obj) + else: + # A future version could also deep copy the schema and update it if these + # values *are* specified. + raise ValueError( + "params, nullable, and name must be unspecified if type is not " + "nanoarrow.Type" + ) + + self._c_schema_view = CSchemaView(self._c_schema) + + @property + def type(self) -> Type: + """Type enumerator value of this Schema + + >>> import nanoarrow as na + >>> na.int32().type + <Type.INT32: 8> + """ + return Type(self._c_schema_view.type_id) + + @property + def name(self) -> Union[str, None]: + """Field name of this Schema + + >>> import nanoarrow as na + >>> schema = na.struct({"col1": na.int32()}) + >>> schema.field(0).name + 'col1' + """ + return self._c_schema.name + + @property + def nullable(self) -> bool: + """Nullability of this field + + >>> import nanoarrow as na + >>> na.int32().nullable + True + >>> na.int32(nullable=False).nullable + False + """ + return self._c_schema_view.nullable + + @property + def byte_width(self) -> Union[int, None]: + """Element byte width for fixed-size binary type + + Returns ``None`` for types for which this property is not relevant. 
+ + >>> import nanoarrow as na + >>> na.fixed_size_binary(123).byte_width + 123 + """ + + if self._c_schema_view.type_id == CArrowType.FIXED_SIZE_BINARY: + return self._c_schema_view.fixed_size + + @property + def unit(self) -> Union[TimeUnit, None]: + """TimeUnit for timestamp, time, and duration types + + Returns ``None`` for types for which this property is not relevant. + + >>> import nanoarrow as na + >>> na.timestamp(na.TimeUnit.SECOND).unit + <TimeUnit.SECOND: 0> + """ + + unit_id = self._c_schema_view.time_unit_id + if unit_id is not None: + return TimeUnit(unit_id) + + @property + def timezone(self) -> Union[str, None]: + """Timezone for timestamp types + + Returns ``None`` for types for which this property is not relevant or + for timestamp types for which the timezone is not set. + + >>> import nanoarrow as na + >>> na.timestamp(na.TimeUnit.SECOND, timezone="America/Halifax").timezone + 'America/Halifax' + """ + if self._c_schema_view.timezone: + return self._c_schema_view.timezone + + @property + def precision(self) -> Union[int, None]: + """Decimal precision + + >>> import nanoarrow as na + >>> na.decimal128(10, 3).precision + 10 + """ + return self._c_schema_view.decimal_precision + + @property + def scale(self) -> Union[int, None]: + """Decimal scale + + >>> import nanoarrow as na + >>> na.decimal128(10, 3).scale + 3 + """ + + return self._c_schema_view.decimal_scale + + @property + def n_fields(self) -> int: + """Number of child Schemas + + >>> import nanoarrow as na + >>> schema = na.struct({"col1": na.int32()}) + >>> schema.n_fields + 1 + """ + + return self._c_schema.n_children + + def field(self, i): + """Extract a child Schema + + >>> import nanoarrow as na + >>> schema = na.struct({"col1": na.int32()}) + >>> schema.field(0) + Schema(INT32, name='col1') + """ + + # Returning a copy to reduce interdependence between Schema instances: + # The CSchema keeps its parent alive when wrapping a child, which might + # be unexpected if the parent schema is very large. 
+ return Schema(self._c_schema.child(i).__deepcopy__()) + + @property + def fields(self): + """Iterate over child Schemas + + >>> import nanoarrow as na + >>> schema = na.struct({"col1": na.int32()}) + >>> for field in schema.fields: + ... print(field.name) + ... + col1 + """ + for i in range(self.n_fields): + yield self.field(i) + + def __repr__(self) -> str: + return _schema_repr(self) + + def __arrow_c_schema__(self): + return self._c_schema.__arrow_c_schema__() + + +def schema(obj, *, name=None, nullable=None, **params) -> Schema: + """Create a nanoarrow Schema + + The Schema is nanoarrow's high-level data type representation, encompassing + the role of PyArrow's ``Schema``, ``Field``, and ``DataType``. This scope + maps to that of the ArrowSchema in the Arrow C Data interface. + + Parameters + ---------- + obj : + A :class:`Type` specifier or a schema-like object. A schema-like object + includes: + * A ``pyarrow.Schema``, ``pyarrow.Field``, or ``pyarrow.DataType`` + * A nanoarrow :class:`Schema`, :class:`CSchema`, or :class:`Type` + * Any object implementing the Arrow PyCapsule interface + ``__arrow_c_schema__()`` protocol method. + + name : str, optional + An optional name to bind to this field. + + nullable : bool, optional + Explicitly specify field nullability. Fields are nullable by default. + Only supported if ``obj`` is a :class:`Type` object (for any other input, + the nullability is preserved from the passed object). + + **params + Type-specific parameters when ``obj`` is a :class:`Type`. + + Examples + -------- + + >>> import nanoarrow as na + >>> import pyarrow as pa + >>> na.schema(na.Type.INT32) + Schema(INT32) + >>> na.schema(na.Type.DURATION, unit=na.TimeUnit.SECOND) + Schema(DURATION, unit=SECOND) + >>> na.schema(pa.int32()) + Schema(INT32) + """ + return Schema(obj, name=name, nullable=nullable, **params) + + +def null(nullable: bool = True) -> Schema: + """Create an instance of a null type. 
+ + Parameters + ---------- + nullable : bool, optional + Use ``False`` to mark this field as non-nullable. + + Examples + -------- + + >>> import nanoarrow as na + >>> na.null() + Schema(NULL) + """ + return Schema(Type.NULL, nullable=nullable) + + +def bool(nullable: bool = True) -> Schema: + """Create an instance of a boolean type. + + Parameters + ---------- + nullable : bool, optional + Use ``False`` to mark this field as non-nullable. + + Examples + -------- + + >>> import nanoarrow as na + >>> na.bool() + Schema(BOOL) + """ + return Schema(Type.BOOL, nullable=nullable) + + +def int8(nullable: bool = True) -> Schema: + """Create an instance of a signed 8-bit integer type. + + Parameters + ---------- + nullable : bool, optional + Use ``False`` to mark this field as non-nullable. + + Examples + -------- + + >>> import nanoarrow as na + >>> na.int8() + Schema(INT8) + """ + return Schema(Type.INT8, nullable=nullable) + + +def uint8(nullable: bool = True) -> Schema: + """Create an instance of an unsigned 8-bit integer type. + + Parameters + ---------- + nullable : bool, optional + Use ``False`` to mark this field as non-nullable. + + Examples + -------- + + >>> import nanoarrow as na + >>> na.uint8() + Schema(UINT8) + """ + return Schema(Type.UINT8, nullable=nullable) + + +def int16(nullable: bool = True) -> Schema: + """Create an instance of a signed 16-bit integer type. + + Parameters + ---------- + nullable : bool, optional + Use ``False`` to mark this field as non-nullable. + + Examples + -------- + + >>> import nanoarrow as na + >>> na.int16() + Schema(INT16) + """ + return Schema(Type.INT16, nullable=nullable) + + +def uint16(nullable: bool = True) -> Schema: + """Create an instance of an unsigned 16-bit integer type. + + Parameters + ---------- + nullable : bool, optional + Use ``False`` to mark this field as non-nullable. 
+ + Examples + -------- + + >>> import nanoarrow as na + >>> na.uint16() + Schema(UINT16) + """ + return Schema(Type.UINT16, nullable=nullable) + + +def int32(nullable: bool = True) -> Schema: + """Create an instance of a signed 32-bit integer type. + + Parameters + ---------- + nullable : bool, optional + Use ``False`` to mark this field as non-nullable. + + Examples + -------- + + >>> import nanoarrow as na + >>> na.int32() + Schema(INT32) + """ + return Schema(Type.INT32, nullable=nullable) + + +def uint32(nullable: bool = True) -> Schema: + """Create an instance of an unsigned 32-bit integer type. + + Parameters + ---------- + nullable : bool, optional + Use ``False`` to mark this field as non-nullable. + + Examples + -------- + + >>> import nanoarrow as na + >>> na.uint32() + Schema(UINT32) + """ + return Schema(Type.UINT32, nullable=nullable) + + +def int64(nullable: bool = True) -> Schema: + """Create an instance of a signed 64-bit integer type. + + Parameters + ---------- + nullable : bool, optional + Use ``False`` to mark this field as non-nullable. + + Examples + -------- + + >>> import nanoarrow as na + >>> na.int64() + Schema(INT64) + """ + return Schema(Type.INT64, nullable=nullable) + + +def uint64(nullable: bool = True) -> Schema: + """Create an instance of an unsigned 64-bit integer type. + + Parameters + ---------- + nullable : bool, optional + Use ``False`` to mark this field as non-nullable. + + Examples + -------- + + >>> import nanoarrow as na + >>> na.uint64() + Schema(UINT64) + """ + return Schema(Type.UINT64, nullable=nullable) + + +def float16(nullable: bool = True) -> Schema: + """Create an instance of a 16-bit floating-point type. + + Parameters + ---------- + nullable : bool, optional + Use ``False`` to mark this field as non-nullable. 
+ + Examples + -------- + + >>> import nanoarrow as na + >>> na.float16() + Schema(HALF_FLOAT) + """ + return Schema(Type.HALF_FLOAT, nullable=nullable) + + +def float32(nullable: bool = True) -> Schema: + """Create an instance of a 32-bit floating-point type. + + Parameters + ---------- + nullable : bool, optional + Use ``False`` to mark this field as non-nullable. + + Examples + -------- + + >>> import nanoarrow as na + >>> na.float32() + Schema(FLOAT) + """ + return Schema(Type.FLOAT, nullable=nullable) + + +def float64(nullable: bool = True) -> Schema: + """Create an instance of a 64-bit floating-point type. + + Parameters + ---------- + nullable : bool, optional + Use ``False`` to mark this field as non-nullable. + + Examples + -------- + + >>> import nanoarrow as na + >>> na.float64() + Schema(DOUBLE) + """ + return Schema(Type.DOUBLE, nullable=nullable) + + +def string(nullable: bool = True) -> Schema: + """Create an instance of a variable-length UTF-8 encoded string type. + + Parameters + ---------- + nullable : bool, optional + Use ``False`` to mark this field as non-nullable. + + Examples + -------- + + >>> import nanoarrow as na + >>> na.string() + Schema(STRING) + """ + return Schema(Type.STRING, nullable=nullable) + + +def large_string(nullable: bool = True) -> Schema: + """Create an instance of a variable-length UTF-8 encoded string type + that uses 64-bit offsets. + + Parameters + ---------- + nullable : bool, optional + Use ``False`` to mark this field as non-nullable. + + Examples + -------- + + >>> import nanoarrow as na + >>> na.large_string() + Schema(LARGE_STRING) + """ + return Schema(Type.LARGE_STRING, nullable=nullable) + + +def binary(nullable: bool = True) -> Schema: + """Create an instance of a variable-length binary type. + + Parameters + ---------- + nullable : bool, optional + Use ``False`` to mark this field as non-nullable. 
+ + Examples + -------- + + >>> import nanoarrow as na + >>> na.binary() + Schema(BINARY) + """ + return Schema(Type.BINARY, nullable=nullable) + + +def large_binary(nullable: bool = True) -> Schema: + """Create an instance of a variable-length binary type that uses 64-bit offsets. + + Parameters + ---------- + nullable : bool, optional + Use ``False`` to mark this field as non-nullable. + + Examples + -------- + + >>> import nanoarrow as na + >>> na.large_binary() + Schema(LARGE_BINARY) + """ + return Schema(Type.LARGE_BINARY, nullable=nullable) + + +def fixed_size_binary(byte_width: int, nullable: bool = True) -> Schema: + """Create an instance of a fixed-width binary type. + + Parameters + ---------- + byte_width : int + The width of each element in bytes. + nullable : bool, optional + Use ``False`` to mark this field as non-nullable. + + Examples + -------- + + >>> import nanoarrow as na + >>> na.fixed_size_binary(123) + Schema(FIXED_SIZE_BINARY, byte_width=123) + """ + return Schema(Type.FIXED_SIZE_BINARY, byte_width=byte_width, nullable=nullable) + + +def date32(nullable: bool = True) -> Schema: + """Create an instance of a 32-bit date type (days since 1970-01-01). + + Parameters + ---------- + nullable : bool, optional + Use ``False`` to mark this field as non-nullable. + + Examples + -------- + + >>> import nanoarrow as na + >>> na.date32() + Schema(DATE32) + """ + return Schema(Type.DATE32, nullable=nullable) + + +def date64(nullable: bool = True) -> Schema: + """Create an instance of a 64-bit date type (milliseconds since 1970-01-01). + + Parameters + ---------- + nullable : bool, optional + Use ``False`` to mark this field as non-nullable. + + Examples + -------- + + >>> import nanoarrow as na + >>> na.date64() + Schema(DATE64) + """ + return Schema(Type.DATE64, nullable=nullable) + + +def time32(unit: Union[str, TimeUnit], nullable: bool = True) -> Schema: + """Create an instance of a 32-bit time of day type. 
+ + Parameters + ---------- + unit : str or :class:`TimeUnit` + The unit of values stored by this type. + nullable : bool, optional + Use ``False`` to mark this field as non-nullable. + + Examples + -------- + + >>> import nanoarrow as na + >>> na.time32("s") + Schema(TIME32, unit=SECOND) + """ + return Schema(Type.TIME32, unit=unit, nullable=nullable) + + +def time64(unit: Union[str, TimeUnit], nullable: bool = True) -> Schema: + """Create an instance of a 64-bit time of day type. + + Parameters + ---------- + unit : str or :class:`TimeUnit` + The unit of values stored by this type. + nullable : bool, optional + Use ``False`` to mark this field as non-nullable. + + Examples + -------- + + >>> import nanoarrow as na + >>> na.time64("us") + Schema(TIME64, unit=MICRO) + """ + return Schema(Type.TIME64, unit=unit, nullable=nullable) + + +def timestamp( + unit: Union[str, TimeUnit], timezone: Union[str, None] = None, nullable: bool = True +) -> Schema: + """Create an instance of a timestamp type. + + Parameters + ---------- + unit : str or :class:`TimeUnit` + The unit of values stored by this type. + nullable : bool, optional + Use ``False`` to mark this field as non-nullable. + + Examples + -------- + + >>> import nanoarrow as na + >>> na.timestamp("s") + Schema(TIMESTAMP, unit=SECOND) + >>> na.timestamp("s", timezone="America/Halifax") + Schema(TIMESTAMP, unit=SECOND, timezone='America/Halifax') + """ + return Schema(Type.TIMESTAMP, timezone=timezone, unit=unit, nullable=nullable) + + +def duration(unit: Union[str, TimeUnit], nullable: bool = True) -> Schema: + """Create an instance of a duration type. + + Parameters + ---------- + unit : str or :class:`TimeUnit` + The unit of values stored by this type. + nullable : bool, optional + Use ``False`` to mark this field as non-nullable. 
+ + Examples + -------- + + >>> import nanoarrow as na + >>> na.duration("s") + Schema(DURATION, unit=SECOND) + """ + return Schema(Type.DURATION, unit=unit, nullable=nullable) + + +def interval_months(nullable: bool = True) -> Schema: + """Create an instance of an interval type measured in months. + + Parameters + ---------- + nullable : bool, optional + Use ``False`` to mark this field as non-nullable. + + Examples + -------- + + >>> import nanoarrow as na + >>> na.interval_months() + Schema(INTERVAL_MONTHS) + """ + return Schema(Type.INTERVAL_MONTHS, nullable=nullable) + + +def interval_day_time(nullable: bool = True) -> Schema: + """Create an instance of an interval type measured as a day/time pair. + + Parameters + ---------- + nullable : bool, optional + Use ``False`` to mark this field as non-nullable. + + Examples + -------- + + >>> import nanoarrow as na + >>> na.interval_day_time() + Schema(INTERVAL_DAY_TIME) + """ + return Schema(Type.INTERVAL_DAY_TIME, nullable=nullable) + + +def interval_month_day_nano(nullable: bool = True) -> Schema: + """Create an instance of an interval type measured as a month/day/nanosecond + tuple. + + Parameters + ---------- + nullable : bool, optional + Use ``False`` to mark this field as non-nullable. + + Examples + -------- + + >>> import nanoarrow as na + >>> na.interval_month_day_nano() + Schema(INTERVAL_MONTH_DAY_NANO) + """ + return Schema(Type.INTERVAL_MONTH_DAY_NANO, nullable=nullable) + + +def decimal128(precision: int, scale: int, nullable: bool = True) -> Schema: + """Create an instance of a 128-bit decimal type. + + Parameters + ---------- + precision : int + The number of significant digits representable by this type. Must be + between 1 and 38. + scale : int + The number of digits after the decimal point for values of this type. + nullable : bool, optional + Use ``False`` to mark this field as non-nullable. 
+ + Examples + -------- + + >>> import nanoarrow as na + >>> na.decimal128(10, 3) + Schema(DECIMAL128, precision=10, scale=3) + """ + return Schema(Type.DECIMAL128, precision=precision, scale=scale, nullable=nullable) + + +def decimal256(precision: int, scale: int, nullable: bool = True) -> Schema: + """Create an instance of a 256-bit decimal type. + + Parameters + ---------- + precision : int + The number of significant digits representable by this type. Must be + between 1 and 76. + scale : int + The number of digits after the decimal point for values of this type. + nullable : bool, optional + Use ``False`` to mark this field as non-nullable. + + Examples + -------- + + >>> import nanoarrow as na + >>> na.decimal256(10, 3) + Schema(DECIMAL256, precision=10, scale=3) + """ + return Schema(Type.DECIMAL256, precision=precision, scale=scale, nullable=nullable) + + +def struct(fields, nullable: bool = True) -> Schema: + """Create a type representing a named sequence of fields. + + Parameters + ---------- + fields : + * A dictionary whose keys are field names and values are schema-like objects + * An iterable whose items are a schema-like object or a two-tuple of the + field name and a schema-like object. If a field name is not specified + from the tuple, the field name is inherited from the schema-like object. + nullable : bool, optional + Use ``False`` to mark this field as non-nullable. 
+ + Examples + -------- + + >>> import nanoarrow as na + >>> na.struct([na.int32()]) + Schema(STRUCT, fields=[Schema(INT32)]) + >>> na.struct([("col1", na.int32())]) + Schema(STRUCT, fields=[Schema(INT32, name='col1')]) + >>> na.struct({"col1": na.int32()}) + Schema(STRUCT, fields=[Schema(INT32, name='col1')]) + """ + return Schema(Type.STRUCT, fields=fields, nullable=nullable) + + +def _c_schema_from_type_and_params( + type: Type, + params: dict, + name: Union[str, None, bool], + nullable: Union[bool, None], +): + factory = CSchemaBuilder.allocate() + + if type == Type.STRUCT: + fields = _clean_fields(params.pop("fields")) + + factory.set_format("+s").allocate_children(len(fields)) + for i, item in enumerate(fields): + child_name, c_schema = item + factory.set_child(i, child_name, c_schema) + + elif type.value in CSchemaView._decimal_types: + precision = int(params.pop("precision")) + scale = int(params.pop("scale")) + factory.set_type_decimal(type.value, precision, scale) + + elif type.value in CSchemaView._time_unit_types: + time_unit = params.pop("unit") + if "timezone" in params: + timezone = params.pop("timezone") + else: + timezone = None + + factory.set_type_date_time( + type.value, TimeUnit.create(time_unit).value, timezone + ) + + elif type == Type.FIXED_SIZE_BINARY: + factory.set_type_fixed_size(type.value, int(params.pop("byte_width"))) + + else: + factory.set_type(type.value) + + if params: + unused = ", ".join(f"'{item}'" for item in params.keys()) + raise ValueError(f"Unused parameters whilst constructing Schema: {unused}") + + # Apply default nullability (True) + if nullable is None: + nullable = True + factory.set_nullable(nullable) + + # Apply default name (an empty string). To explicitly set a NULL + # name, a caller would have to specify False. 
+ if name is None: + name = "" + elif name is False: + name = None + factory.set_name(name) + + return factory.finish() + + +def _clean_fields(fields): + if isinstance(fields, dict): + return [(str(k), c_schema(v)) for k, v in fields.items()] + else: + fields_clean = [] + for item in fields: + if isinstance(item, tuple) and len(item) == 2: + fields_clean.append((str(item[0]), c_schema(item[1]))) + else: + fields_clean.append((None, c_schema(item))) + + return fields_clean + + +def _schema_repr(obj): + out = f"Schema({_schema_param_repr('type', obj.type)}" + + if obj.name is None: + out += ", name=False" + elif obj.name: + out += f", name={_schema_param_repr('name', obj.name)}" + + if obj._c_schema_view.type_id not in _PARAM_NAMES: + param_names = [] + else: + param_names = _PARAM_NAMES[obj._c_schema_view.type_id] + + for name in param_names: + value = getattr(obj, name) + if value is None: + continue + out += ", " + param_repr = f"{name}={_schema_param_repr(name, getattr(obj, name))}" + out += param_repr + + if not obj.nullable: + out += ", nullable=False" + + out += ")" + return out + + +def _schema_param_repr(name, value): + if name == "type": + return f"{value.name}" + elif name == "unit": + return f"{value.name}" + elif name == "fields": + # It would be nice to indent this/get it on multiple lines since + # most output will be uncomfortably wide even with the abbreviated repr + return reprlib.Repr().repr(list(value)) + else: + return reprlib.Repr().repr(value) + + +_PARAM_NAMES = { + CArrowType.FIXED_SIZE_BINARY: ("byte_width",), + CArrowType.TIMESTAMP: ("unit", "timezone"), + CArrowType.TIME32: ("unit",), + CArrowType.TIME64: ("unit",), + CArrowType.DURATION: ("unit",), + CArrowType.DECIMAL128: ("precision", "scale"), + CArrowType.DECIMAL256: ("precision", "scale"), + CArrowType.STRUCT: ("fields",), +} diff --git a/python/tests/test_schema.py b/python/tests/test_schema.py new file mode 100644 index 000000000..bc27214b4 --- /dev/null +++ 
b/python/tests/test_schema.py
@@ -0,0 +1,185 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+import pytest
+
+import nanoarrow as na
+
+
+def test_type_schema_protocol():
+    c_schema = na.c_schema(na.Type.INT32)
+    assert c_schema.format == "i"
+
+
+def test_time_unit_create():
+    assert na.TimeUnit.create("s") == na.TimeUnit.SECOND
+    assert na.TimeUnit.create("ms") == na.TimeUnit.MILLI
+    assert na.TimeUnit.create("us") == na.TimeUnit.MICRO
+    assert na.TimeUnit.create("ns") == na.TimeUnit.NANO
+
+    assert na.TimeUnit.create(na.TimeUnit.SECOND) == na.TimeUnit.SECOND
+
+
+def test_schema_create_c_schema():
+    schema_obj = na.int32()
+    assert schema_obj.type == na.Type.INT32
+
+    schema_obj2 = na.Schema(schema_obj._c_schema)
+    assert schema_obj2.type == schema_obj.type
+    assert schema_obj2._c_schema is schema_obj._c_schema
+
+    with pytest.raises(ValueError, match="must be unspecified"):
+        na.Schema(schema_obj._c_schema, some_parameter="some_value")
+
+    with pytest.raises(ValueError, match="must be unspecified"):
+        na.Schema(schema_obj._c_schema, nullable=True)
+
+    with pytest.raises(ValueError, match="must be unspecified"):
+        na.Schema(schema_obj._c_schema, name="")
+
+
+def test_schema_create_no_params():
+    schema_obj = na.int32()
+    assert schema_obj.type == na.Type.INT32
+    assert schema_obj.nullable is True
+    assert repr(schema_obj) == "Schema(INT32)"
+
+    schema_obj = na.int32(nullable=False)
+    assert schema_obj.nullable is False
+    assert "nullable=False" in repr(schema_obj)
+
+    schema_obj = na.Schema(na.Type.INT32, name=False)
+    assert schema_obj.name is None
+    assert "name=False" in repr(schema_obj)
+
+    schema_obj = na.Schema(na.Type.INT32, name="not empty")
+    assert schema_obj.name == "not empty"
+    assert "name='not empty'" in repr(schema_obj)
+
+    with pytest.raises(ValueError, match=r"^Unused parameter"):
+        na.Schema(na.Type.INT32, unused_param="unused_value")
+
+
+def test_schema_simple():
+    assert na.null().type == na.Type.NULL
+    assert na.bool().type == na.Type.BOOL
+    assert na.int8().type == na.Type.INT8
+    assert na.uint8().type == na.Type.UINT8
+    assert na.int16().type == na.Type.INT16
+    assert na.uint16().type == na.Type.UINT16
+    assert na.int32().type == na.Type.INT32
+    assert na.uint32().type == na.Type.UINT32
+    assert na.int64().type == na.Type.INT64
+    assert na.uint64().type == na.Type.UINT64
+    assert na.float16().type == na.Type.HALF_FLOAT
+    assert na.float32().type == na.Type.FLOAT
+    assert na.float64().type == na.Type.DOUBLE
+    assert na.string().type == na.Type.STRING
+    assert na.large_string().type == na.Type.LARGE_STRING
+    assert na.binary().type == na.Type.BINARY
+    assert na.large_binary().type == na.Type.LARGE_BINARY
+    assert na.date32().type == na.Type.DATE32
+    assert na.date64().type == na.Type.DATE64
+    assert na.interval_months().type == na.Type.INTERVAL_MONTHS
+    assert na.interval_day_time().type == na.Type.INTERVAL_DAY_TIME
+    assert na.interval_month_day_nano().type == na.Type.INTERVAL_MONTH_DAY_NANO
+
+
+def test_schema_fixed_size_binary():
+    schema_obj = na.fixed_size_binary(byte_width=123)
+    assert schema_obj.type == na.Type.FIXED_SIZE_BINARY
+    assert schema_obj.byte_width == 123
+    assert "byte_width=123" in repr(schema_obj)
+
+
+def test_schema_time():
+    schema_obj = na.time32(na.TimeUnit.SECOND)
+    assert schema_obj.type == na.Type.TIME32
+    assert schema_obj.unit == na.TimeUnit.SECOND
+    assert "unit=SECOND" in repr(schema_obj)
+
+    schema_obj = na.time64(na.TimeUnit.MICRO)
+    assert schema_obj.type == na.Type.TIME64
+    assert schema_obj.unit == na.TimeUnit.MICRO
+    assert "unit=MICRO" in repr(schema_obj)
+
+
+def test_schema_timestamp():
+    schema_obj = na.timestamp(na.TimeUnit.SECOND)
+    assert schema_obj.type == na.Type.TIMESTAMP
+    assert schema_obj.unit == na.TimeUnit.SECOND
+    assert schema_obj.timezone is None
+
+    schema_obj = na.timestamp(na.TimeUnit.SECOND, timezone="America/Halifax")
+    assert schema_obj.timezone == "America/Halifax"
+    assert "timezone='America/Halifax'" in repr(schema_obj)
+
+
+def test_schema_duration():
+    schema_obj = na.duration(na.TimeUnit.SECOND)
+    assert schema_obj.type == na.Type.DURATION
+    assert schema_obj.unit == na.TimeUnit.SECOND
+    assert "unit=SECOND" in repr(schema_obj)
+
+
+def test_schema_decimal():
+    schema_obj = na.decimal128(10, 3)
+    assert schema_obj.type == na.Type.DECIMAL128
+    assert schema_obj.precision == 10
+    assert schema_obj.scale == 3
+    assert "precision=10" in repr(schema_obj)
+    assert "scale=3" in repr(schema_obj)
+
+    schema_obj = na.decimal256(10, 3)
+    assert schema_obj.type == na.Type.DECIMAL256
+    assert schema_obj.precision == 10
+    assert schema_obj.scale == 3
+    assert "precision=10" in repr(schema_obj)
+    assert "scale=3" in repr(schema_obj)
+
+
+def test_schema_struct():
+    # Make sure we can use just a list
+    schema_obj = na.struct([na.Type.INT32])
+    assert schema_obj.type == na.Type.STRUCT
+    assert schema_obj.n_fields == 1
+    assert schema_obj.field(0).type == na.Type.INT32
+    assert schema_obj.field(0).name == ""
+    for field in schema_obj.fields:
+        assert isinstance(field, na.Schema)
+
+    assert "fields=[Schema(INT32)]" in repr(schema_obj)
+
+    # Make sure we can use a list of two-tuples
+    schema_obj = na.struct([("col_name", na.Type.INT32)])
+    assert schema_obj.type == na.Type.STRUCT
+    assert schema_obj.field(0).type == na.Type.INT32
+    assert schema_obj.field(0).name == "col_name"
+    assert "fields=[Schema(INT32, name='col_name')]" in repr(schema_obj)
+
+    # Make sure we can use a dictionary to specify fields
+    schema_obj = na.struct({"col_name": na.Type.INT32})
+    assert schema_obj.type == na.Type.STRUCT
+    assert schema_obj.field(0).type == na.Type.INT32
+    assert schema_obj.field(0).name == "col_name"
+
+    # Make sure we can use a Schema when constructing fields (and that
+    # field names are taken from the input)
+    schema_obj = na.struct([schema_obj.field(0)])
+    assert schema_obj.type == na.Type.STRUCT
+    assert schema_obj.field(0).type == na.Type.INT32
+    assert schema_obj.field(0).name == "col_name"
diff --git a/src/nanoarrow/schema.c b/src/nanoarrow/schema.c
index 9ff1ac734..dd5edb420 100644
--- a/src/nanoarrow/schema.c
+++ b/src/nanoarrow/schema.c
@@ -296,10 +296,33 @@ ArrowErrorCode ArrowSchemaSetTypeDateTime(struct ArrowSchema* schema, enum Arrow
   int n_chars;
   switch (type) {
     case NANOARROW_TYPE_TIME32:
+      if (timezone != NULL) {
+        return EINVAL;
+      }
+
+      switch (time_unit) {
+        case NANOARROW_TIME_UNIT_MICRO:
+        case NANOARROW_TIME_UNIT_NANO:
+          return EINVAL;
+        default:
+          break;
+      }
+
+      n_chars = snprintf(buffer, sizeof(buffer), "tt%s", time_unit_str);
+      break;
     case NANOARROW_TYPE_TIME64:
       if (timezone != NULL) {
         return EINVAL;
       }
+
+      switch (time_unit) {
+        case NANOARROW_TIME_UNIT_SECOND:
+        case NANOARROW_TIME_UNIT_MILLI:
+          return EINVAL;
+        default:
+          break;
+      }
+
       n_chars = snprintf(buffer, sizeof(buffer), "tt%s", time_unit_str);
       break;
     case NANOARROW_TYPE_TIMESTAMP:
diff --git a/src/nanoarrow/schema_test.cc b/src/nanoarrow/schema_test.cc
index e909aee14..d8bfd3c4b 100644
--- a/src/nanoarrow/schema_test.cc
+++ b/src/nanoarrow/schema_test.cc
@@ -232,6 +232,21 @@ TEST(SchemaTest, SchemaInitDateTime) {
                                        NANOARROW_TIME_UNIT_SECOND, "non-null timezone"),
             EINVAL);
 
+  ArrowSchemaInit(&schema);
+  EXPECT_EQ(ArrowSchemaSetTypeDateTime(&schema, NANOARROW_TYPE_TIME32,
+                                       NANOARROW_TIME_UNIT_MICRO, nullptr),
+            EINVAL);
+
+  ArrowSchemaInit(&schema);
+  EXPECT_EQ(ArrowSchemaSetTypeDateTime(&schema, NANOARROW_TYPE_TIME64,
+                                       NANOARROW_TIME_UNIT_MICRO, "non-null timezone"),
+            EINVAL);
+
+  ArrowSchemaInit(&schema);
+  EXPECT_EQ(ArrowSchemaSetTypeDateTime(&schema, NANOARROW_TYPE_TIME64,
+                                       NANOARROW_TIME_UNIT_MILLI, nullptr),
+            EINVAL);
+
   ArrowSchemaInit(&schema);
   EXPECT_EQ(ArrowSchemaSetTypeDateTime(&schema, NANOARROW_TYPE_DURATION,
                                        NANOARROW_TIME_UNIT_SECOND, "non-null timezone"),