Python, GeoParquet: expose schema_arrow attribute on ParquetFile and ParquetDataset (#700)

Expose the schema before reading any of the actual data.
kylebarron authored Aug 15, 2024
1 parent 667389b commit 6611236
Showing 2 changed files with 24 additions and 5 deletions.
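The new attribute makes the resolved Arrow schema available from file metadata alone. A minimal sketch of how this might look from Python; the import path follows the stub file touched below, but the `ParquetFile` constructor arguments (a path and an object-store instance) are assumptions for illustration, not something this diff shows:

    # Minimal usage sketch. The constructor arguments (`path`, `store`) are
    # assumed for illustration; only the attributes below appear in this diff.
    from geoarrow.rust.core import ParquetFile

    parquet_file = ParquetFile("data/buildings.parquet", store)  # `store` assumed
    print(parquet_file.num_row_groups)  # metadata-only, as before
    print(parquet_file.schema_arrow)    # arro3.core.Schema, before any data is read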
python/core/python/geoarrow/rust/core/_rust.pyi (5 additions, 1 deletion)
@@ -14,7 +14,7 @@ from typing import (
     overload,
 )
 
-from arro3.core import Array, ChunkedArray, RecordBatchReader, Table
+from arro3.core import Array, ChunkedArray, RecordBatchReader, Table, Schema
 
 try:
     import numpy as np
@@ -916,6 +916,8 @@ class ParquetFile:
     def num_rows(self) -> int: ...
     @property
     def num_row_groups(self) -> int: ...
+    @property
+    def schema_arrow(self) -> Schema: ...
     def row_group_bounds(
         self,
         minx_path: Sequence[str],
@@ -959,6 +961,8 @@ class ParquetDataset:
     def num_rows(self) -> int: ...
     @property
     def num_row_groups(self) -> int: ...
+    @property
+    def schema_arrow(self) -> Schema: ...
     async def read_async(
         self,
         *,
python/core/src/io/parquet/reader.rs (19 additions, 4 deletions)
@@ -19,10 +19,11 @@ use geoarrow::io::parquet::{
 };
 use geoarrow::table::Table;
 use object_store::{ObjectMeta, ObjectStore};
-use parquet::arrow::arrow_reader::ArrowReaderMetadata;
+use parquet::arrow::arrow_reader::{ArrowReaderMetadata, ArrowReaderOptions};
 use parquet::arrow::async_reader::ParquetObjectReader;
 use pyo3::exceptions::{PyFileNotFoundError, PyValueError};
 use pyo3::prelude::*;
+use pyo3_arrow::PySchema;
 use tokio::runtime::Runtime;
 
 /// Read a GeoParquet file from a path on disk into a GeoTable.
@@ -92,7 +93,7 @@ pub fn read_parquet(
 
     let table = GeoParquetRecordBatchStreamBuilder::try_new_with_options(
         reader,
-        Default::default(),
+        ArrowReaderOptions::new().with_page_index(true),
         geo_options,
     )
     .await?
@@ -117,7 +118,7 @@ pub fn read_parquet(
 
     let table = GeoParquetRecordBatchReaderBuilder::try_new_with_options(
         file,
-        Default::default(),
+        ArrowReaderOptions::new().with_page_index(true),
         geo_options,
     )?
     .build()?
@@ -189,7 +190,7 @@ pub fn read_parquet_async(
 
     let table = GeoParquetRecordBatchStreamBuilder::try_new_with_options(
         reader,
-        Default::default(),
+        ArrowReaderOptions::new().with_page_index(true),
         geo_options,
     )
     .await
@@ -268,6 +269,13 @@ impl ParquetFile {
         self.geoparquet_meta.num_row_groups()
     }
 
+    /// Access the Arrow schema of the generated data
+    #[getter]
+    fn schema_arrow(&self, py: Python) -> PyGeoArrowResult<PyObject> {
+        let schema = self.geoparquet_meta.resolved_schema(Default::default())?;
+        Ok(PySchema::new(schema).to_arro3(py)?)
+    }
+
     /// Get the bounds of a single row group.
     ///
     /// As of GeoParquet 1.1 you won't need to pass in these column names, as they'll be specified
@@ -549,6 +557,13 @@ impl ParquetDataset {
         self.meta.num_row_groups()
     }
 
+    /// Access the Arrow schema of the generated data
+    #[getter]
+    fn schema_arrow(&self, py: Python) -> PyGeoArrowResult<PyObject> {
+        let schema = self.meta.resolved_schema(Default::default())?;
+        Ok(PySchema::new(schema).to_arro3(py)?)
+    }
+
     /// Read this entire file in an async fashion.
     #[pyo3(signature = (*, batch_size=None, limit=None, offset=None, bbox=None, bbox_paths=None))]
     fn read_async(
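For a dataset spanning many files, the same getter resolves one unified schema from the combined metadata. Another hypothetical sketch under the same assumptions; the `ParquetDataset` constructor arguments are not shown in this commit:

    # Sketch only: `file_urls` and `store` are assumed inputs.
    from geoarrow.rust.core import ParquetDataset

    dataset = ParquetDataset(file_urls, store)
    schema = dataset.schema_arrow  # one Schema for the whole dataset, no row data read
    # Column names and GeoParquet geometry metadata can be inspected here,
    # before committing to dataset.read_async().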
