From 6611236e1a6fc53f497c065f0c26fc9e7d3bb2e7 Mon Sep 17 00:00:00 2001 From: Kyle Barron Date: Wed, 14 Aug 2024 23:26:53 -0400 Subject: [PATCH] Python, GeoParquet: expose `schema_arrow` attribute on ParquetFile and ParquetDataset (#700) Expose the schema before reading any of the actual data. --- .../core/python/geoarrow/rust/core/_rust.pyi | 6 ++++- python/core/src/io/parquet/reader.rs | 23 +++++++++++++++---- 2 files changed, 24 insertions(+), 5 deletions(-) diff --git a/python/core/python/geoarrow/rust/core/_rust.pyi b/python/core/python/geoarrow/rust/core/_rust.pyi index 2eacf056..9338f434 100644 --- a/python/core/python/geoarrow/rust/core/_rust.pyi +++ b/python/core/python/geoarrow/rust/core/_rust.pyi @@ -14,7 +14,7 @@ from typing import ( overload, ) -from arro3.core import Array, ChunkedArray, RecordBatchReader, Table +from arro3.core import Array, ChunkedArray, RecordBatchReader, Table, Schema try: import numpy as np @@ -916,6 +916,8 @@ class ParquetFile: def num_rows(self) -> int: ... @property def num_row_groups(self) -> int: ... + @property + def schema_arrow(self) -> Schema: ... def row_group_bounds( self, minx_path: Sequence[str], @@ -959,6 +961,8 @@ class ParquetDataset: def num_rows(self) -> int: ... @property def num_row_groups(self) -> int: ... + @property + def schema_arrow(self) -> Schema: ... async def read_async( self, *, diff --git a/python/core/src/io/parquet/reader.rs b/python/core/src/io/parquet/reader.rs index 3d7b73c0..fd3d523f 100644 --- a/python/core/src/io/parquet/reader.rs +++ b/python/core/src/io/parquet/reader.rs @@ -19,10 +19,11 @@ use geoarrow::io::parquet::{ }; use geoarrow::table::Table; use object_store::{ObjectMeta, ObjectStore}; -use parquet::arrow::arrow_reader::ArrowReaderMetadata; +use parquet::arrow::arrow_reader::{ArrowReaderMetadata, ArrowReaderOptions}; use parquet::arrow::async_reader::ParquetObjectReader; use pyo3::exceptions::{PyFileNotFoundError, PyValueError}; use pyo3::prelude::*; +use pyo3_arrow::PySchema; use tokio::runtime::Runtime; /// Read a GeoParquet file from a path on disk into a GeoTable. @@ -92,7 +93,7 @@ pub fn read_parquet( let table = GeoParquetRecordBatchStreamBuilder::try_new_with_options( reader, - Default::default(), + ArrowReaderOptions::new().with_page_index(true), geo_options, ) .await? @@ -117,7 +118,7 @@ pub fn read_parquet( let table = GeoParquetRecordBatchReaderBuilder::try_new_with_options( file, - Default::default(), + ArrowReaderOptions::new().with_page_index(true), geo_options, )? .build()? @@ -189,7 +190,7 @@ pub fn read_parquet_async( let table = GeoParquetRecordBatchStreamBuilder::try_new_with_options( reader, - Default::default(), + ArrowReaderOptions::new().with_page_index(true), geo_options, ) .await @@ -268,6 +269,13 @@ impl ParquetFile { self.geoparquet_meta.num_row_groups() } + /// Access the Arrow schema of the generated data + #[getter] + fn schema_arrow(&self, py: Python) -> PyGeoArrowResult { + let schema = self.geoparquet_meta.resolved_schema(Default::default())?; + Ok(PySchema::new(schema).to_arro3(py)?) + } + /// Get the bounds of a single row group. /// /// As of GeoParquet 1.1 you won't need to pass in these column names, as they'll be specified @@ -549,6 +557,13 @@ impl ParquetDataset { self.meta.num_row_groups() } + /// Access the Arrow schema of the generated data + #[getter] + fn schema_arrow(&self, py: Python) -> PyGeoArrowResult { + let schema = self.meta.resolved_schema(Default::default())?; + Ok(PySchema::new(schema).to_arro3(py)?) + } + /// Read this entire file in an async fashion. #[pyo3(signature = (*, batch_size=None, limit=None, offset=None, bbox=None, bbox_paths=None))] fn read_async(