From ad41bb8605a1814e97e987f2a78991ae3c82a897 Mon Sep 17 00:00:00 2001 From: Even Rouault Date: Wed, 6 Nov 2024 00:43:59 +0100 Subject: [PATCH] OGRLayer::GetArrowStream(): add a DATETIME_AS_STRING=YES/NO option DATETIME_AS_STRING=YES/NO. Defaults to NO. Added in GDAL 3.11. Whether DateTime fields should be returned as a (normally ISO-8601 formatted) string by drivers. The aim is to be able to handle mixed timezones (or timezone naive values) in the same column. All drivers must honour that option, and potentially fallback to the OGRLayer generic implementation if they cannot (which is the case for the Arrow, Parquet and ADBC drivers). When DATETIME_AS_STRING=YES, the TIMEZONE option is ignored. Fixes https://github.com/geopandas/pyogrio/issues/487 --- autotest/ogr/ogr_adbc.py | 30 ++++ autotest/ogr/ogr_flatgeobuf.py | 49 ++++++ autotest/ogr/ogr_gpkg.py | 73 +++++++++ autotest/ogr/ogr_mem.py | 43 ++++++ autotest/ogr/ogr_parquet.py | 22 +++ ogr/ogrsf_frmts/adbc/ogradbclayer.cpp | 5 +- .../arrow_common/ograrrowlayer.hpp | 18 +++ .../flatgeobuf/ogrflatgeobuflayer.cpp | 99 ++++++------ .../generic/ograrrowarrayhelper.cpp | 23 ++- ogr/ogrsf_frmts/generic/ogrlayerarrow.cpp | 146 +++++++++++++++++- ogr/ogrsf_frmts/generic/ogrlayerarrow.h | 3 + ogr/ogrsf_frmts/gpkg/ogr_geopackage.h | 1 + ogr/ogrsf_frmts/gpkg/ogrgeopackagelayer.cpp | 26 +++- .../gpkg/ogrgeopackagetablelayer.cpp | 34 ++-- 14 files changed, 499 insertions(+), 73 deletions(-) diff --git a/autotest/ogr/ogr_adbc.py b/autotest/ogr/ogr_adbc.py index 8ee1da12a50f..7a6f8b93c0e0 100755 --- a/autotest/ogr/ogr_adbc.py +++ b/autotest/ogr/ogr_adbc.py @@ -326,6 +326,36 @@ def test_ogr_adbc_test_ogrsf_parquet_filename_with_glob(): assert "ERROR" not in ret +############################################################################### +# Test DATETIME_AS_STRING=YES GetArrowStream() option + + +def test_ogr_adbc_arrow_stream_numpy_datetime_as_string(tmp_vsimem): + pytest.importorskip("osgeo.gdal_array") + 
pytest.importorskip("numpy") + + if not _has_libduckdb(): + pytest.skip("libduckdb.so missing") + + with gdal.OpenEx( + "data/parquet/test.parquet", gdal.OF_VECTOR, allowed_drivers=["ADBC"] + ) as ds: + lyr = ds.GetLayer(0) + stream = lyr.GetArrowStreamAsNumPy( + options=["USE_MASKED_ARRAYS=NO", "DATETIME_AS_STRING=YES"] + ) + batches = [batch for batch in stream] + batch = batches[0] + # Should be "2019-01-01T14:00:00.500-02:15" but DuckDB returns in UTC + # On my machine, for some reason it returns without the Z, whereas on + # the ubuntu_2404 it returns with the Z... despite both using libduckdb 1.1.3 + # at time of writing... + assert batch["timestamp_ms_gmt_minus_0215"][0] in ( + b"2019-01-01T16:15:00.500", + b"2019-01-01T16:15:00.500Z", + ) + + ############################################################################### # Run test_ogrsf on a DuckDB dataset diff --git a/autotest/ogr/ogr_flatgeobuf.py b/autotest/ogr/ogr_flatgeobuf.py index bdbd0819e106..6e68443010ea 100644 --- a/autotest/ogr/ogr_flatgeobuf.py +++ b/autotest/ogr/ogr_flatgeobuf.py @@ -1564,3 +1564,52 @@ def test_ogr_flatgeobuf_sql_arrow(tmp_vsimem): assert f["bar"] == "baz" assert f.GetGeometryRef().ExportToWkt() == "POINT (1 2)" f = tmp_lyr.GetNextFeature() + + +############################################################################### +# Test DATETIME_AS_STRING=YES GetArrowStream() option + + +def test_ogr_flatgeobuf_arrow_stream_numpy_datetime_as_string(tmp_vsimem): + pytest.importorskip("osgeo.gdal_array") + pytest.importorskip("numpy") + + filename = str(tmp_vsimem / "datetime_as_string.fgb") + with ogr.GetDriverByName("FlatGeoBuf").CreateDataSource(filename) as ds: + lyr = ds.CreateLayer("test") + + field = ogr.FieldDefn("datetime", ogr.OFTDateTime) + lyr.CreateField(field) + + f = ogr.Feature(lyr.GetLayerDefn()) + f.SetGeometry(ogr.CreateGeometryFromWkt("POINT (1 2)")) + lyr.CreateFeature(f) + + f = ogr.Feature(lyr.GetLayerDefn()) + f.SetField("datetime", 
"2022-05-31T12:34:56.789Z") + f.SetGeometry(ogr.CreateGeometryFromWkt("POINT (1 2)")) + lyr.CreateFeature(f) + + f = ogr.Feature(lyr.GetLayerDefn()) + f.SetField("datetime", "2022-05-31T12:34:56") + f.SetGeometry(ogr.CreateGeometryFromWkt("POINT (1 2)")) + lyr.CreateFeature(f) + + f = ogr.Feature(lyr.GetLayerDefn()) + f.SetField("datetime", "2022-05-31T12:34:56+12:30") + f.SetGeometry(ogr.CreateGeometryFromWkt("POINT (1 2)")) + lyr.CreateFeature(f) + + with ogr.Open(filename) as ds: + lyr = ds.GetLayer(0) + stream = lyr.GetArrowStreamAsNumPy( + options=["USE_MASKED_ARRAYS=NO", "DATETIME_AS_STRING=YES"] + ) + batches = [batch for batch in stream] + assert len(batches) == 1 + batch = batches[0] + assert len(batch["datetime"]) == 4 + assert batch["datetime"][0] == b"" + assert batch["datetime"][1] == b"2022-05-31T12:34:56.789Z" + assert batch["datetime"][2] == b"2022-05-31T12:34:56" + assert batch["datetime"][3] == b"2022-05-31T12:34:56+12:30" diff --git a/autotest/ogr/ogr_gpkg.py b/autotest/ogr/ogr_gpkg.py index 78e882d94892..0654d5e07945 100755 --- a/autotest/ogr/ogr_gpkg.py +++ b/autotest/ogr/ogr_gpkg.py @@ -10766,3 +10766,76 @@ def test_gpkg_secure_delete(tmp_vsimem): with ds.ExecuteSQL("PRAGMA secure_delete") as sql_lyr: f = sql_lyr.GetNextFeature() assert f.GetField(0) == 0 + + +############################################################################### +# Test DATETIME_AS_STRING=YES GetArrowStream() option + + +def test_ogr_gpkg_arrow_stream_numpy_datetime_as_string(tmp_vsimem): + pytest.importorskip("osgeo.gdal_array") + pytest.importorskip("numpy") + + filename = str(tmp_vsimem / "datetime_as_string.gpkg") + ds = ogr.GetDriverByName("GPKG").CreateDataSource(filename) + lyr = ds.CreateLayer("test") + + field = ogr.FieldDefn("datetime", ogr.OFTDateTime) + lyr.CreateField(field) + + f = ogr.Feature(lyr.GetLayerDefn()) + lyr.CreateFeature(f) + + f = ogr.Feature(lyr.GetLayerDefn()) + f.SetField("datetime", "2022-05-31T12:34:56.789Z") + lyr.CreateFeature(f) + + 
f = ogr.Feature(lyr.GetLayerDefn()) + f.SetField("datetime", "2022-05-31T12:34:56.000") + lyr.CreateFeature(f) + + f = ogr.Feature(lyr.GetLayerDefn()) + f.SetField("datetime", "2022-05-31T12:34:56.000+12:30") + lyr.CreateFeature(f) + + # Test DATETIME_AS_STRING=YES + stream = lyr.GetArrowStreamAsNumPy( + options=["USE_MASKED_ARRAYS=NO", "DATETIME_AS_STRING=YES"] + ) + batches = [batch for batch in stream] + assert len(batches) == 1 + batch = batches[0] + assert len(batch["datetime"]) == 4 + assert batch["datetime"][0] == b"" + assert batch["datetime"][1] == b"2022-05-31T12:34:56.789Z" + assert batch["datetime"][2] == b"2022-05-31T12:34:56.000" + assert batch["datetime"][3] == b"2022-05-31T12:34:56.000+12:30" + + # Setting a filer tests the use of the less optimized + # OGRGeoPackageTableLayer::GetNextArray() implementation + lyr.SetAttributeFilter("1 = 1") + stream = lyr.GetArrowStreamAsNumPy( + options=["USE_MASKED_ARRAYS=NO", "DATETIME_AS_STRING=YES"] + ) + lyr.SetAttributeFilter(None) + batches = [batch for batch in stream] + assert len(batches) == 1 + batch = batches[0] + assert len(batch["datetime"]) == 4 + assert batch["datetime"][0] == b"" + assert batch["datetime"][1] == b"2022-05-31T12:34:56.789Z" + assert batch["datetime"][2] == b"2022-05-31T12:34:56.000" + assert batch["datetime"][3] == b"2022-05-31T12:34:56.000+12:30" + + with ds.ExecuteSQL("SELECT * FROM test") as sql_lyr: + stream = sql_lyr.GetArrowStreamAsNumPy( + options=["USE_MASKED_ARRAYS=NO", "DATETIME_AS_STRING=YES"] + ) + batches = [batch for batch in stream] + assert len(batches) == 1 + batch = batches[0] + assert len(batch["datetime"]) == 4 + assert batch["datetime"][0] == b"" + assert batch["datetime"][1] == b"2022-05-31T12:34:56.789Z" + assert batch["datetime"][2] == b"2022-05-31T12:34:56.000" + assert batch["datetime"][3] == b"2022-05-31T12:34:56.000+12:30" diff --git a/autotest/ogr/ogr_mem.py b/autotest/ogr/ogr_mem.py index 3d26ac31a00b..c5918ea20085 100755 --- a/autotest/ogr/ogr_mem.py 
+++ b/autotest/ogr/ogr_mem.py @@ -979,6 +979,49 @@ def test_ogr_mem_arrow_stream_numpy(): assert len(batches) == 0 +############################################################################### +# Test DATETIME_AS_STRING=YES GetArrowStream() option + + +def test_ogr_mem_arrow_stream_numpy_datetime_as_string(): + pytest.importorskip("osgeo.gdal_array") + pytest.importorskip("numpy") + + ds = ogr.GetDriverByName("Memory").CreateDataSource("") + lyr = ds.CreateLayer("foo") + + field = ogr.FieldDefn("datetime", ogr.OFTDateTime) + lyr.CreateField(field) + + f = ogr.Feature(lyr.GetLayerDefn()) + lyr.CreateFeature(f) + + f = ogr.Feature(lyr.GetLayerDefn()) + f.SetField("datetime", "2022-05-31T12:34:56.789Z") + lyr.CreateFeature(f) + + f = ogr.Feature(lyr.GetLayerDefn()) + f.SetField("datetime", "2022-05-31T12:34:56") + lyr.CreateFeature(f) + + f = ogr.Feature(lyr.GetLayerDefn()) + f.SetField("datetime", "2022-05-31T12:34:56+12:30") + lyr.CreateFeature(f) + + # Test DATETIME_AS_STRING=YES + stream = lyr.GetArrowStreamAsNumPy( + options=["USE_MASKED_ARRAYS=NO", "DATETIME_AS_STRING=YES"] + ) + batches = [batch for batch in stream] + assert len(batches) == 1 + batch = batches[0] + assert len(batch["datetime"]) == 4 + assert batch["datetime"][0] == b"" + assert batch["datetime"][1] == b"2022-05-31T12:34:56.789Z" + assert batch["datetime"][2] == b"2022-05-31T12:34:56" + assert batch["datetime"][3] == b"2022-05-31T12:34:56+12:30" + + ############################################################################### diff --git a/autotest/ogr/ogr_parquet.py b/autotest/ogr/ogr_parquet.py index b89fc1673b1e..1608ea58c86d 100755 --- a/autotest/ogr/ogr_parquet.py +++ b/autotest/ogr/ogr_parquet.py @@ -4150,3 +4150,25 @@ def test_ogr_parquet_IsArrowSchemaSupported_arrow_15_types( success, error_msg = dst_lyr.IsArrowSchemaSupported(schema) assert not success assert error_msg == expected_error_msg + + +############################################################################### +# Test 
DATETIME_AS_STRING=YES GetArrowStream() option + + +def test_ogr_parquet_arrow_stream_numpy_datetime_as_string(tmp_vsimem): + pytest.importorskip("osgeo.gdal_array") + pytest.importorskip("numpy") + + with gdal.OpenEx( + "data/parquet/test.parquet", gdal.OF_VECTOR, allowed_drivers=["Parquet"] + ) as ds: + lyr = ds.GetLayer(0) + stream = lyr.GetArrowStreamAsNumPy( + options=["USE_MASKED_ARRAYS=NO", "DATETIME_AS_STRING=YES"] + ) + batches = [batch for batch in stream] + batch = batches[0] + assert ( + batch["timestamp_ms_gmt_minus_0215"][0] == b"2019-01-01T14:00:00.500-02:15" + ) diff --git a/ogr/ogrsf_frmts/adbc/ogradbclayer.cpp b/ogr/ogrsf_frmts/adbc/ogradbclayer.cpp index 02a7b45d7c2e..66169b486dca 100644 --- a/ogr/ogrsf_frmts/adbc/ogradbclayer.cpp +++ b/ogr/ogrsf_frmts/adbc/ogradbclayer.cpp @@ -155,8 +155,11 @@ GDALDataset *OGRADBCLayer::GetDataset() bool OGRADBCLayer::GetArrowStream(struct ArrowArrayStream *out_stream, CSLConstList papszOptions) { - if (m_poFilterGeom || m_poAttrQuery) + if (m_poFilterGeom || m_poAttrQuery || + CPLFetchBool(papszOptions, GAS_OPT_DATETIME_AS_STRING, false)) + { return OGRLayer::GetArrowStream(out_stream, papszOptions); + } if (m_stream) { diff --git a/ogr/ogrsf_frmts/arrow_common/ograrrowlayer.hpp b/ogr/ogrsf_frmts/arrow_common/ograrrowlayer.hpp index a52cf4e4227f..bc6453618ae3 100644 --- a/ogr/ogrsf_frmts/arrow_common/ograrrowlayer.hpp +++ b/ogr/ogrsf_frmts/arrow_common/ograrrowlayer.hpp @@ -5490,6 +5490,24 @@ inline bool OGRArrowLayer::UseRecordBatchBaseImplementation() const return true; } + if (m_aosArrowArrayStreamOptions.FetchBool(GAS_OPT_DATETIME_AS_STRING, + false)) + { + const int nFieldCount = m_poFeatureDefn->GetFieldCount(); + for (int i = 0; i < nFieldCount; ++i) + { + const auto poFieldDefn = m_poFeatureDefn->GetFieldDefn(i); + if (!poFieldDefn->IsIgnored() && + poFieldDefn->GetType() == OFTDateTime) + { + CPLDebug("ARROW", + "DATETIME_AS_STRING=YES not compatible of fast " + "Arrow implementation"); + return true; 
+ } + } + } + if (EQUAL(m_aosArrowArrayStreamOptions.FetchNameValueDef( "GEOMETRY_ENCODING", ""), "WKB")) diff --git a/ogr/ogrsf_frmts/flatgeobuf/ogrflatgeobuflayer.cpp b/ogr/ogrsf_frmts/flatgeobuf/ogrflatgeobuflayer.cpp index fc02bb18a4db..22767210a8df 100644 --- a/ogr/ogrsf_frmts/flatgeobuf/ogrflatgeobuflayer.cpp +++ b/ogr/ogrsf_frmts/flatgeobuf/ogrflatgeobuflayer.cpp @@ -18,6 +18,7 @@ #include "cpl_time.h" #include "ogr_p.h" #include "ograrrowarrayhelper.h" +#include "ogrlayerarrow.h" #include "ogr_recordbatch.h" #include "ogr_flatgeobuf.h" @@ -1481,6 +1482,8 @@ int OGRFlatGeobufLayer::GetNextArrowArray(struct ArrowArrayStream *stream, } const GIntBig nFeatureIdxStart = m_featuresPos; + const bool bDateTimeAsString = m_aosArrowArrayStreamOptions.FetchBool( + GAS_OPT_DATETIME_AS_STRING, false); const uint32_t nMemLimit = OGRArrowArrayHelper::GetMemLimit(); while (iFeat < sHelper.m_nMaxBatchSize) @@ -1851,6 +1854,58 @@ int OGRFlatGeobufLayer::GetNextArrowArray(struct ArrowArrayStream *stream, offset += sizeof(double); break; + case ColumnType::DateTime: + { + if (!bDateTimeAsString) + { + if (offset + sizeof(uint32_t) > size) + { + CPLErrorInvalidSize("datetime length "); + goto error; + } + uint32_t len; + memcpy(&len, data + offset, sizeof(int32_t)); + CPL_LSBPTR32(&len); + offset += sizeof(uint32_t); + if (len > size - offset || len > 32) + { + CPLErrorInvalidSize("datetime value"); + goto error; + } + if (!isIgnored) + { + OGRField ogrField; + if (ParseDateTime( + reinterpret_cast(data + + offset), + len, &ogrField)) + { + sHelper.SetDateTime( + psArray, iFeat, brokenDown, + sHelper.m_anTZFlags[i], ogrField); + } + else + { + char str[32 + 1]; + memcpy(str, data + offset, len); + str[len] = '\0'; + if (OGRParseDate(str, &ogrField, 0)) + { + sHelper.SetDateTime( + psArray, iFeat, brokenDown, + sHelper.m_anTZFlags[i], ogrField); + } + } + } + offset += len; + break; + } + else + { + [[fallthrough]]; + } + } + case ColumnType::String: case ColumnType::Json: case 
ColumnType::Binary: @@ -1896,50 +1951,6 @@ int OGRFlatGeobufLayer::GetNextArrowArray(struct ArrowArrayStream *stream, offset += len; break; } - - case ColumnType::DateTime: - { - if (offset + sizeof(uint32_t) > size) - { - CPLErrorInvalidSize("datetime length "); - goto error; - } - uint32_t len; - memcpy(&len, data + offset, sizeof(int32_t)); - CPL_LSBPTR32(&len); - offset += sizeof(uint32_t); - if (len > size - offset || len > 32) - { - CPLErrorInvalidSize("datetime value"); - goto error; - } - if (!isIgnored) - { - OGRField ogrField; - if (ParseDateTime(reinterpret_cast( - data + offset), - len, &ogrField)) - { - sHelper.SetDateTime(psArray, iFeat, brokenDown, - sHelper.m_anTZFlags[i], - ogrField); - } - else - { - char str[32 + 1]; - memcpy(str, data + offset, len); - str[len] = '\0'; - if (OGRParseDate(str, &ogrField, 0)) - { - sHelper.SetDateTime( - psArray, iFeat, brokenDown, - sHelper.m_anTZFlags[i], ogrField); - } - } - } - offset += len; - break; - } } } } diff --git a/ogr/ogrsf_frmts/generic/ograrrowarrayhelper.cpp b/ogr/ogrsf_frmts/generic/ograrrowarrayhelper.cpp index e17a378e68ba..37a3de83c3c3 100644 --- a/ogr/ogrsf_frmts/generic/ograrrowarrayhelper.cpp +++ b/ogr/ogrsf_frmts/generic/ograrrowarrayhelper.cpp @@ -11,6 +11,7 @@ ****************************************************************************/ #include "ograrrowarrayhelper.h" +#include "ogrlayerarrow.h" #include "ogr_p.h" #include @@ -94,6 +95,8 @@ OGRArrowArrayHelper::OGRArrowArrayHelper( nTZFlagOverride = OGR_TZFLAG_UTC; } } + const bool bDateTimeAsString = + aosArrowArrayStreamOptions.FetchBool(GAS_OPT_DATETIME_AS_STRING, false); if (m_bIncludeFID) { @@ -222,6 +225,20 @@ OGRArrowArrayHelper::OGRArrowArrayHelper( } break; } + + case OFTDateTime: + { + if (!bDateTimeAsString) + { + nEltSize = sizeof(int64_t); + break; + } + else + { + [[fallthrough]]; + } + } + case OFTString: case OFTBinary: { @@ -256,12 +273,6 @@ OGRArrowArrayHelper::OGRArrowArrayHelper( break; } - case OFTDateTime: - { - 
nEltSize = sizeof(int64_t); - break; - } - default: break; } diff --git a/ogr/ogrsf_frmts/generic/ogrlayerarrow.cpp b/ogr/ogrsf_frmts/generic/ogrlayerarrow.cpp index dae75364e787..b51184064d50 100644 --- a/ogr/ogrsf_frmts/generic/ogrlayerarrow.cpp +++ b/ogr/ogrsf_frmts/generic/ogrlayerarrow.cpp @@ -24,6 +24,8 @@ #include "cpl_float.h" #include "cpl_json.h" #include "cpl_time.h" + +#include #include #include #include @@ -376,6 +378,8 @@ int OGRLayer::GetArrowSchema(struct ArrowArrayStream *, { const bool bIncludeFID = CPLTestBool( m_aosArrowArrayStreamOptions.FetchNameValueDef("INCLUDE_FID", "YES")); + const bool bDateTimeAsString = m_aosArrowArrayStreamOptions.FetchBool( + GAS_OPT_DATETIME_AS_STRING, false); memset(out_schema, 0, sizeof(*out_schema)); out_schema->format = "+s"; out_schema->name = CPLStrdup(""); @@ -523,7 +527,11 @@ int OGRLayer::GetArrowSchema(struct ArrowArrayStream *, const char *pszPrefix = "tsm:"; const char *pszTZOverride = m_aosArrowArrayStreamOptions.FetchNameValue("TIMEZONE"); - if (pszTZOverride && EQUAL(pszTZOverride, "unknown")) + if (bDateTimeAsString) + { + psChild->format = "u"; + } + else if (pszTZOverride && EQUAL(pszTZOverride, "unknown")) { psChild->format = CPLStrdup(pszPrefix); } @@ -1812,6 +1820,99 @@ FillDateTimeArray(struct ArrowArray *psChild, return true; } +/************************************************************************/ +/* FillDateTimeArrayAsString() */ +/************************************************************************/ + +static size_t +FillDateTimeArrayAsString(struct ArrowArray *psChild, + std::deque> &apoFeatures, + const size_t nFeatureCountLimit, + const bool bIsNullable, const int i, + const size_t nMemLimit) +{ + psChild->n_buffers = 3; + psChild->buffers = static_cast(CPLCalloc(3, sizeof(void *))); + uint8_t *pabyValidity = nullptr; + using T = uint32_t; + T *panOffsets = static_cast( + VSI_MALLOC_ALIGNED_AUTO_VERBOSE(sizeof(T) * (1 + nFeatureCountLimit))); + if (panOffsets == nullptr) + return 
0; + psChild->buffers[1] = panOffsets; + + size_t nOffset = 0; + size_t nFeatCount = 0; + for (size_t iFeat = 0; iFeat < nFeatureCountLimit; ++iFeat, ++nFeatCount) + { + panOffsets[iFeat] = static_cast(nOffset); + const auto psRawField = apoFeatures[iFeat]->GetRawFieldRef(i); + if (IsValidField(psRawField)) + { + size_t nLen = strlen("YYYY-MM-DDTHH:MM:SS"); + if (fmodf(psRawField->Date.Second, 1.0f) != 0) + nLen += strlen(".sss"); + if (psRawField->Date.TZFlag == OGR_TZFLAG_UTC) + nLen += 1; // 'Z' + else if (psRawField->Date.TZFlag > OGR_TZFLAG_MIXED_TZ) + nLen += strlen("+hh:mm"); + if (nLen > nMemLimit - nOffset) + { + if (nFeatCount == 0) + return 0; + break; + } + nOffset += static_cast(nLen); + } + else if (bIsNullable) + { + ++psChild->null_count; + if (pabyValidity == nullptr) + { + pabyValidity = AllocValidityBitmap(nFeatureCountLimit); + psChild->buffers[0] = pabyValidity; + if (pabyValidity == nullptr) + return 0; + } + UnsetBit(pabyValidity, iFeat); + } + } + panOffsets[nFeatCount] = static_cast(nOffset); + + char *pachValues = + static_cast(VSI_MALLOC_ALIGNED_AUTO_VERBOSE(nOffset + 1)); + if (pachValues == nullptr) + return 0; + psChild->buffers[2] = pachValues; + + nOffset = 0; + char szBuffer[OGR_SIZEOF_ISO8601_DATETIME_BUFFER]; + OGRISO8601Format sFormat; + sFormat.ePrecision = OGRISO8601Precision::AUTO; + for (size_t iFeat = 0; iFeat < nFeatCount; ++iFeat) + { + const int nLen = + static_cast(panOffsets[iFeat + 1] - panOffsets[iFeat]); + if (nLen) + { + const auto psRawField = apoFeatures[iFeat]->GetRawFieldRef(i); + int nBufSize = OGRGetISO8601DateTime(psRawField, sFormat, szBuffer); + if (nBufSize) + { + memcpy(pachValues + nOffset, szBuffer, + std::min(nLen, nBufSize)); + } + if (nBufSize < nLen) + { + memset(pachValues + nOffset + nBufSize, 0, nLen - nBufSize); + } + nOffset += nLen; + } + } + + return nFeatCount; +} + /************************************************************************/ /* GetNextArrowArray() */ 
/************************************************************************/ @@ -1832,6 +1933,8 @@ int OGRLayer::GetNextArrowArray(struct ArrowArrayStream *stream, const bool bIncludeFID = CPLTestBool( m_aosArrowArrayStreamOptions.FetchNameValueDef("INCLUDE_FID", "YES")); + const bool bDateTimeAsString = m_aosArrowArrayStreamOptions.FetchBool( + GAS_OPT_DATETIME_AS_STRING, false); int nMaxBatchSize = atoi(m_aosArrowArrayStreamOptions.FetchNameValueDef( "MAX_FEATURES_IN_BATCH", "65536")); if (nMaxBatchSize <= 0) @@ -2179,10 +2282,25 @@ int OGRLayer::GetNextArrowArray(struct ArrowArrayStream *stream, case OFTDateTime: { - if (!FillDateTimeArray(psChild, oFeatureQueue, nFeatureCount, - bIsNullable, i, - poFieldDefn->GetTZFlag())) - goto error; + if (bDateTimeAsString) + { + const size_t nThisFeatureCount = FillDateTimeArrayAsString( + psChild, oFeatureQueue, nFeatureCount, bIsNullable, i, + nMemLimit); + if (nThisFeatureCount == 0) + { + goto error_max_mem; + } + if (nThisFeatureCount < nFeatureCount) + nFeatureCount = nThisFeatureCount; + } + else + { + if (!FillDateTimeArray(psChild, oFeatureQueue, + nFeatureCount, bIsNullable, i, + poFieldDefn->GetTZFlag())) + goto error; + } break; } } @@ -2419,6 +2537,15 @@ From OGR using the Arrow C Stream data interface tutorial. * to UTC of a OGRField::Date is only done if both the timezone indicated by * OGRField::Date::TZFlag and the one at the OGRFieldDefn level (or set by * this TIMEZONE option) are not unknown. + *
  • DATETIME_AS_STRING=YES/NO. Defaults to NO. Added in GDAL 3.11. + * Whether DateTime fields should be returned as a (normally ISO-8601 + * formatted) string by drivers. The aim is to be able to handle mixed + * timezones (or timezone-naive values) in the same column. + * All drivers must honour that option, and potentially fall back to the + * OGRLayer generic implementation if they cannot (which is the case for the + * Arrow, Parquet and ADBC drivers). + * When DATETIME_AS_STRING=YES, the TIMEZONE option is ignored. + *
  • *
  • GEOMETRY_METADATA_ENCODING=OGC/GEOARROW (GDAL >= 3.8). * The default is OGC, which will lead to setting * the Arrow geometry column metadata to ARROW:extension:name=ogc.wkb. @@ -2613,6 +2740,15 @@ YES.
  • * to UTC of a OGRField::Date is only done if both the timezone indicated by * OGRField::Date::TZFlag and the one at the OGRFieldDefn level (or set by * this TIMEZONE option) are not unknown. + *
  • DATETIME_AS_STRING=YES/NO. Defaults to NO. Added in GDAL 3.11. + * Whether DateTime fields should be returned as a (normally ISO-8601 + * formatted) string by drivers. The aim is to be able to handle mixed + * timezones (or timezone-naive values) in the same column. + * All drivers must honour that option, and potentially fall back to the + * OGRLayer generic implementation if they cannot (which is the case for the + * Arrow, Parquet and ADBC drivers). + * When DATETIME_AS_STRING=YES, the TIMEZONE option is ignored. + *
  • *
  • GEOMETRY_METADATA_ENCODING=OGC/GEOARROW (GDAL >= 3.8). * The default is OGC, which will lead to setting * the Arrow geometry column metadata to ARROW:extension:name=ogc.wkb. diff --git a/ogr/ogrsf_frmts/generic/ogrlayerarrow.h b/ogr/ogrsf_frmts/generic/ogrlayerarrow.h index e8f5fbd489d1..5571a649cc8a 100644 --- a/ogr/ogrsf_frmts/generic/ogrlayerarrow.h +++ b/ogr/ogrsf_frmts/generic/ogrlayerarrow.h @@ -26,6 +26,9 @@ constexpr const char *EXTENSION_NAME_OGC_WKB = "ogc.wkb"; constexpr const char *EXTENSION_NAME_GEOARROW_WKB = "geoarrow.wkb"; constexpr const char *EXTENSION_NAME_ARROW_JSON = "arrow.json"; +// GetArrowStream(GAS) options +constexpr const char *GAS_OPT_DATETIME_AS_STRING = "DATETIME_AS_STRING"; + std::map CPL_DLL OGRParseArrowMetadata(const char *pabyMetadata); diff --git a/ogr/ogrsf_frmts/gpkg/ogr_geopackage.h b/ogr/ogrsf_frmts/gpkg/ogr_geopackage.h index ec7432de933c..2d1422ec0155 100644 --- a/ogr/ogrsf_frmts/gpkg/ogr_geopackage.h +++ b/ogr/ogrsf_frmts/gpkg/ogr_geopackage.h @@ -87,6 +87,7 @@ struct OGRGPKGTableLayerFillArrowArray int nCountRows = 0; bool bErrorOccurred = false; bool bMemoryLimitReached = false; + bool bDateTimeAsString = false; std::string osErrorMsg{}; OGRFeatureDefn *poFeatureDefn = nullptr; OGRGeoPackageLayer *poLayer = nullptr; diff --git a/ogr/ogrsf_frmts/gpkg/ogrgeopackagelayer.cpp b/ogr/ogrsf_frmts/gpkg/ogrgeopackagelayer.cpp index 479b717edc81..7d8265310e55 100644 --- a/ogr/ogrsf_frmts/gpkg/ogrgeopackagelayer.cpp +++ b/ogr/ogrsf_frmts/gpkg/ogrgeopackagelayer.cpp @@ -17,6 +17,7 @@ #include "ogr_p.h" #include "ogr_recordbatch.h" #include "ograrrowarrayhelper.h" +#include "ogrlayerarrow.h" /************************************************************************/ /* OGRGeoPackageLayer() */ @@ -550,6 +551,9 @@ int OGRGeoPackageLayer::GetNextArrowArray(struct ArrowArrayStream *stream, struct tm brokenDown; memset(&brokenDown, 0, sizeof(brokenDown)); + const bool bDateTimeAsString = m_aosArrowArrayStreamOptions.FetchBool( + 
GAS_OPT_DATETIME_AS_STRING, false); + const uint32_t nMemLimit = OGRArrowArrayHelper::GetMemLimit(); int iFeat = 0; while (iFeat < sHelper.m_nMaxBatchSize) @@ -845,15 +849,23 @@ int OGRGeoPackageLayer::GetNextArrowArray(struct ArrowArrayStream *stream, case OFTDateTime: { - OGRField ogrField; - if (ParseDateTimeField(hStmt, iRawField, nSqlite3ColType, - &ogrField, poFieldDefn, nFID)) + if (!bDateTimeAsString) + { + OGRField ogrField; + if (ParseDateTimeField(hStmt, iRawField, + nSqlite3ColType, &ogrField, + poFieldDefn, nFID)) + { + sHelper.SetDateTime(psArray, iFeat, brokenDown, + sHelper.m_anTZFlags[iField], + ogrField); + } + break; + } + else { - sHelper.SetDateTime(psArray, iFeat, brokenDown, - sHelper.m_anTZFlags[iField], - ogrField); + [[fallthrough]]; } - break; } case OFTString: diff --git a/ogr/ogrsf_frmts/gpkg/ogrgeopackagetablelayer.cpp b/ogr/ogrsf_frmts/gpkg/ogrgeopackagetablelayer.cpp index 8fc0a3d444e3..724c2a6069dd 100644 --- a/ogr/ogrsf_frmts/gpkg/ogrgeopackagetablelayer.cpp +++ b/ogr/ogrsf_frmts/gpkg/ogrgeopackagetablelayer.cpp @@ -13,6 +13,7 @@ #include "ogr_geopackage.h" #include "ogrgeopackageutility.h" +#include "ogrlayerarrow.h" #include "ogrsqliteutility.h" #include "cpl_md5.h" #include "cpl_time.h" @@ -7820,6 +7821,7 @@ void OGR_GPKG_FillArrowArray_Step(sqlite3_context *pContext, int /*argc*/, const auto nMemLimit = psFillArrowArray->nMemLimit; const int SQLITE_MAX_FUNCTION_ARG = sqlite3_limit(psFillArrowArray->hDB, SQLITE_LIMIT_FUNCTION_ARG, -1); + const bool bDateTimeAsString = psFillArrowArray->bDateTimeAsString; begin: int iFeat; OGRArrowArrayHelper *psHelper; @@ -8171,18 +8173,25 @@ void OGR_GPKG_FillArrowArray_Step(sqlite3_context *pContext, int /*argc*/, case OFTDateTime: { - OGRField ogrField; - const auto pszTxt = reinterpret_cast( - sqlite3_value_text(argv[iCol])); - if (pszTxt != nullptr && - psFillArrowArray->poLayer->ParseDateTimeField( - pszTxt, &ogrField, poFieldDefn, nFID)) + if (!bDateTimeAsString) + { + OGRField ogrField; 
+ const auto pszTxt = reinterpret_cast( + sqlite3_value_text(argv[iCol])); + if (pszTxt != nullptr && + psFillArrowArray->poLayer->ParseDateTimeField( + pszTxt, &ogrField, poFieldDefn, nFID)) + { + psHelper->SetDateTime( + psArray, iFeat, psFillArrowArray->brokenDown, + psHelper->m_anTZFlags[iField], ogrField); + } + break; + } + else { - psHelper->SetDateTime( - psArray, iFeat, psFillArrowArray->brokenDown, - psHelper->m_anTZFlags[iField], ogrField); + [[fallthrough]]; } - break; } case OFTString: @@ -8337,6 +8346,9 @@ int OGRGeoPackageTableLayer::GetNextArrowArrayAsynchronous( m_poFillArrowArray->psHelper = std::move(psHelper); m_poFillArrowArray->nCountRows = 0; m_poFillArrowArray->bErrorOccurred = false; + m_poFillArrowArray->bDateTimeAsString = + m_aosArrowArrayStreamOptions.FetchBool(GAS_OPT_DATETIME_AS_STRING, + false); m_poFillArrowArray->poFeatureDefn = m_poFeatureDefn; m_poFillArrowArray->poLayer = this; m_poFillArrowArray->hDB = m_poDS->GetDB(); @@ -8888,6 +8900,8 @@ int OGRGeoPackageTableLayer::GetNextArrowArrayInternal( sFillArrowArray.nCountRows = 0; sFillArrowArray.bMemoryLimitReached = false; sFillArrowArray.bErrorOccurred = false; + sFillArrowArray.bDateTimeAsString = m_aosArrowArrayStreamOptions.FetchBool( + GAS_OPT_DATETIME_AS_STRING, false); sFillArrowArray.poFeatureDefn = m_poFeatureDefn; sFillArrowArray.poLayer = this; sFillArrowArray.hDB = m_poDS->GetDB();