Python, GeoParquet: expose schema_arrow attribute on ParquetFile and ParquetDataset (#700)

Expose the schema before reading any of the actual data.
kylebarron authored Aug 15, 2024
1 parent 667389b commit 6611236
Showing 2 changed files with 24 additions and 5 deletions.
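The new attribute makes the resolved Arrow schema available from file metadata alone. A minimal sketch of how this might look from Python; the import path follows the stub file touched below, but the `ParquetFile` constructor arguments (a path and an object-store instance) are assumptions for illustration, not something this diff shows:

    # Minimal usage sketch. The constructor arguments (`path`, `store`) are
    # assumed for illustration; only the attributes below appear in this diff.
    from geoarrow.rust.core import ParquetFile

    parquet_file = ParquetFile("data/buildings.parquet", store)  # `store` assumed
    print(parquet_file.num_row_groups)  # metadata-only, as before
    print(parquet_file.schema_arrow)    # arro3.core.Schema, before any data is read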
python/core/python/geoarrow/rust/core/_rust.pyi (5 additions, 1 deletion)
@@ -14,7 +14,7 @@ from typing import (
     overload,
 )
 
-from arro3.core import Array, ChunkedArray, RecordBatchReader, Table
+from arro3.core import Array, ChunkedArray, RecordBatchReader, Table, Schema
 
 try:
     import numpy as np
@@ -916,6 +916,8 @@ class ParquetFile:
     def num_rows(self) -> int: ...
     @property
     def num_row_groups(self) -> int: ...
+    @property
+    def schema_arrow(self) -> Schema: ...
     def row_group_bounds(
         self,
         minx_path: Sequence[str],
@@ -959,6 +961,8 @@ class ParquetDataset:
     def num_rows(self) -> int: ...
     @property
     def num_row_groups(self) -> int: ...
+    @property
+    def schema_arrow(self) -> Schema: ...
     async def read_async(
         self,
         *,
python/core/src/io/parquet/reader.rs (19 additions, 4 deletions)
@@ -19,10 +19,11 @@ use geoarrow::io::parquet::{
 };
 use geoarrow::table::Table;
 use object_store::{ObjectMeta, ObjectStore};
-use parquet::arrow::arrow_reader::ArrowReaderMetadata;
+use parquet::arrow::arrow_reader::{ArrowReaderMetadata, ArrowReaderOptions};
 use parquet::arrow::async_reader::ParquetObjectReader;
 use pyo3::exceptions::{PyFileNotFoundError, PyValueError};
 use pyo3::prelude::*;
+use pyo3_arrow::PySchema;
 use tokio::runtime::Runtime;
 
 /// Read a GeoParquet file from a path on disk into a GeoTable.
@@ -92,7 +93,7 @@ pub fn read_parquet(
 
     let table = GeoParquetRecordBatchStreamBuilder::try_new_with_options(
         reader,
-        Default::default(),
+        ArrowReaderOptions::new().with_page_index(true),
         geo_options,
     )
     .await?
@@ -117,7 +118,7 @@ pub fn read_parquet(
 
     let table = GeoParquetRecordBatchReaderBuilder::try_new_with_options(
         file,
-        Default::default(),
+        ArrowReaderOptions::new().with_page_index(true),
         geo_options,
     )?
     .build()?
@@ -189,7 +190,7 @@ pub fn read_parquet_async(
 
     let table = GeoParquetRecordBatchStreamBuilder::try_new_with_options(
         reader,
-        Default::default(),
+        ArrowReaderOptions::new().with_page_index(true),
         geo_options,
     )
     .await
@@ -268,6 +269,13 @@ impl ParquetFile {
         self.geoparquet_meta.num_row_groups()
     }
 
+    /// Access the Arrow schema of the generated data
+    #[getter]
+    fn schema_arrow(&self, py: Python) -> PyGeoArrowResult<PyObject> {
+        let schema = self.geoparquet_meta.resolved_schema(Default::default())?;
+        Ok(PySchema::new(schema).to_arro3(py)?)
+    }
+
     /// Get the bounds of a single row group.
     ///
     /// As of GeoParquet 1.1 you won't need to pass in these column names, as they'll be specified
@@ -549,6 +557,13 @@ impl ParquetDataset {
         self.meta.num_row_groups()
     }
 
+    /// Access the Arrow schema of the generated data
+    #[getter]
+    fn schema_arrow(&self, py: Python) -> PyGeoArrowResult<PyObject> {
+        let schema = self.meta.resolved_schema(Default::default())?;
+        Ok(PySchema::new(schema).to_arro3(py)?)
+    }
+
     /// Read this entire file in an async fashion.
     #[pyo3(signature = (*, batch_size=None, limit=None, offset=None, bbox=None, bbox_paths=None))]
     fn read_async(
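For a dataset spanning many files, the same getter resolves one unified schema from the combined metadata. Another hypothetical sketch under the same assumptions; the `ParquetDataset` constructor arguments are not shown in this commit:

    # Sketch only: `file_urls` and `store` are assumed inputs.
    from geoarrow.rust.core import ParquetDataset

    dataset = ParquetDataset(file_urls, store)
    schema = dataset.schema_arrow  # one Schema for the whole dataset, no row data read
    # Column names and GeoParquet geometry metadata can be inspected here,
    # before committing to dataset.read_async().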
