RELEASE version 0.2.0 (#6)
Co-authored-by: LeAnhMinh <[email protected]>
MinhLA1410 and LeAnhMinh authored Nov 9, 2021
1 parent 1c93528 commit 6a24715
Showing 97 changed files with 49,200 additions and 4,256 deletions.
28 changes: 28 additions & 0 deletions .github/workflows/ci.yml
@@ -0,0 +1,28 @@
name: CI

on:
  push:
    branches: ['*']
  pull_request:
    branches: ['*']

jobs:
  test:
    strategy:
      fail-fast: false
      matrix:
        pg: [13, 12, 11, 10]
        ccflags: ['']
        include:
          - pg: 13
            ccflags: '-DCACHING_TEST'
    name: PostgreSQL ${{ matrix.pg }}
    runs-on: ubuntu-latest
    container: zilder/pg-ext-check
    steps:
      - run: pg-setup ${{ matrix.pg }}
      - uses: actions/checkout@v2
      - run: ./install_arrow.sh
      - run: build-check
        env:
          CCFLAGS: ${{ matrix.ccflags }}
55 changes: 5 additions & 50 deletions .gitignore
@@ -1,52 +1,7 @@
# Prerequisites
*.d

# Object files

*.o
*.ko
*.obj
*.elf

# Linker output
*.ilk
*.map
*.exp

# Precompiled Headers
*.gch
*.pch

# Libraries
*.lib
*.a
*.la
*.lo

# Shared objects (inc. Windows DLLs)
*.dll
*.so
*.so.*
*.dylib

# Executables
*.exe
*.out
*.app
*.i*86
*.x86_64
*.hex

# Debug files
*.dSYM/
*.su
*.idb
*.pdb

# Kernel Module Compile Results
*.mod*
*.cmd
.tmp_versions/
modules.order
Module.symvers
Mkfile.old
dkms.conf
*.bc
regression.diffs
regression.out
results
31 changes: 22 additions & 9 deletions Makefile
@@ -1,5 +1,5 @@
MODULE_big = parquet_s3_fdw
OBJS = parquet_impl.o parquet_fdw.o
OBJS = src/common.o src/reader.o src/exec_state.o src/parquet_impl.o src/parquet_fdw.o
# Add file for S3
OBJS += parquet_s3_fdw.o parquet_s3_fdw_connection.o parquet_s3_fdw_server_option.o

@@ -10,39 +10,52 @@ SHLIB_LINK = -lm -lstdc++ -lparquet -larrow
SHLIB_LINK += -laws-cpp-sdk-core -laws-cpp-sdk-s3

EXTENSION = parquet_s3_fdw
DATA = parquet_s3_fdw--0.1.sql parquet_s3_fdw--0.1--0.2.sql
DATA = parquet_s3_fdw--0.1.sql parquet_s3_fdw--0.1--0.2.sql parquet_s3_fdw--0.2--0.3.sql parquet_s3_fdw--0.3.sql

REGRESS = parquet_fdw import parquet_s3_fdw import_s3 parquet_s3_fdw2

EXTRA_CLEAN = sql/parquet_fdw.sql expected/parquet_fdw.out

PG_CONFIG ?= pg_config
REGRESS = import_local import_server parquet_s3_fdw_local parquet_s3_fdw_server parquet_s3_fdw_post_local parquet_s3_fdw_post_server parquet_s3_fdw2

# parquet_impl.cpp requires C++ 11.
override PG_CXXFLAGS += -std=c++11 -O3

PGXS := $(shell $(PG_CONFIG) --pgxs)

# pass CCFLAGS (when defined) to both C and C++ compilers.
ifdef CCFLAGS
override PG_CXXFLAGS += $(CCFLAGS)
override PG_CFLAGS += $(CCFLAGS)
endif

ifdef USE_PGXS
PG_CONFIG = pg_config
PGXS := $(shell $(PG_CONFIG) --pgxs)
include $(PGXS)

# XXX: PostgreSQL below 11 does not automatically add -fPIC or equivalent to C++
# flags when building a shared library, so we have to do it here explicitly.
ifeq ($(shell test $(VERSION_NUM) -lt 110000; echo $$?), 0)
override CXXFLAGS += $(CFLAGS_SL)
endif
else
subdir = contrib/parquet_s3_fdw
top_builddir = ../..

# PostgreSQL uses a link-time optimization option which may break compilation
# (this happens on travis-ci). Redefine COMPILE.cxx.bc without this option.
COMPILE.cxx.bc = $(CLANG) -xc++ -Wno-ignored-attributes $(BITCODE_CXXFLAGS) $(CPPFLAGS) -emit-llvm -c

include $(top_builddir)/src/Makefile.global
include $(top_srcdir)/contrib/contrib-global.mk

# XXX: a hurdle to use common compiler flags when building bytecode from C++
# files. This should not be necessary, but src/Makefile.global omits passing
# those flags for an unknown reason.
%.bc : %.cpp
	$(COMPILE.cxx.bc) $(CXXFLAGS) $(CPPFLAGS) -o $@ $<
endif

ifdef REGRESS_PREFIX
REGRESS_PREFIX_SUB = $(REGRESS_PREFIX)
else
REGRESS_PREFIX_SUB = $(VERSION)
endif

REGRESS := $(addprefix $(REGRESS_PREFIX_SUB)/,$(REGRESS))
$(shell mkdir -p results/$(REGRESS_PREFIX_SUB))
40 changes: 33 additions & 7 deletions README.md
@@ -3,13 +3,12 @@
This PostgreSQL extension is a Foreign Data Wrapper (FDW) for accessing Parquet files on the local file system and [Amazon S3][2].
This version of parquet_s3_fdw works with PostgreSQL 13.

Parquet foreign data wrapper supporting S3 access for PostgreSQL.
Read-only Apache Parquet foreign data wrapper supporting S3 access for PostgreSQL.

This code is based on [`parquet_fdw`][1] created by adjust GmbH.

## Installation
### 1. Install dependent libraries
`parquet_s3_fdw` requires `libarrow` and `libparquet` installed in your system (requires version 0.15, for previous versions use branch [arrow-0.14](https://github.com/adjust/parquet_fdw/tree/arrow-0.14)). Please refer to [building guide](https://github.com/apache/arrow/blob/master/cpp/README.md).
`parquet_s3_fdw` requires `libarrow` and `libparquet` installed on your system (version 0.15+ is required; for previous versions use branch [arrow-0.14](https://github.com/adjust/parquet_fdw/tree/arrow-0.14)). Please refer to the [building guide](https://github.com/apache/arrow/blob/master/docs/source/developers/cpp/building.rst).

The `AWS SDK for C++ (libaws-cpp-sdk-core libaws-cpp-sdk-s3)` is also required (the confirmed version is 1.8.14).

@@ -51,8 +50,9 @@ CREATE USER MAPPING FOR public SERVER parquet_s3_srv OPTIONS (user 's3user', pas
### Create foreign table
Now you should be able to create foreign tables from Parquet files. Currently `parquet_s3_fdw` supports the following column [types](https://github.com/apache/arrow/blob/master/cpp/src/arrow/type.h) (to be extended shortly):

| Parquet type | SQL type |
|--------------|-----------|
| Arrow type | SQL type |
|-------------:|----------:|
| INT16 | INT2 |
| INT32 | INT4 |
| INT64 | INT8 |
| FLOAT | FLOAT4 |
@@ -62,6 +62,7 @@ Now you should be able to create foreign table from Parquet files. Currently `pa
| STRING | TEXT |
| BINARY | BYTEA |
| LIST | ARRAY |
| MAP | JSONB |

Currently `parquet_s3_fdw` doesn't support structs and nested lists.
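
Since Arrow `MAP` columns now map to `JSONB`, a foreign table over the sample `data/complex/example3.parquet` file added in this commit might be declared as follows. This is a minimal sketch: the table name and file path are assumptions for illustration, not taken from this diff.

```sql
-- Hypothetical sketch: Arrow MAP columns are exposed as JSONB.
-- Assumes the parquet_s3_srv server created earlier in this README.
CREATE FOREIGN TABLE example3 (
    one   JSONB,  -- MAP<INT32, STRING>
    two   JSONB,  -- MAP<DATE32, INT16>
    three TEXT    -- STRING
)
SERVER parquet_s3_srv
OPTIONS (filename '/path/to/data/complex/example3.parquet');
```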

@@ -70,7 +71,19 @@ Following options are supported:
* **dirname** - path to directory having Parquet files to read;
* **sorted** - space-separated list of columns that Parquet files are presorted by; this helps PostgreSQL avoid redundant sorting when running a query with an `ORDER BY` clause, or in other cases when having a presorted set is beneficial (Group Aggregate, Merge Join); see the sketch after this list;
* **use_mmap** - whether memory map operations will be used instead of file read operations (default `false`);
* **use_threads** - enables `arrow`'s parallel columns decoding/decompression (default `false`).
* **use_threads** - enables Apache Arrow's parallel column decoding/decompression (default `false`);
* **files_func** - user-defined function that is used by parquet_s3_fdw to retrieve the list of Parquet files on each query; the function must take one `JSONB` argument and return a text array of full paths to Parquet files;
* **files_func_arg** - argument for the function specified by **files_func**;
* **max_open_files** - the limit for the number of Parquet files opened simultaneously.
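
A minimal sketch of these options in use, assuming the `parquet_s3_srv` server from the setup steps above; the table name, column names, and file locations are hypothetical:

```sql
-- Hypothetical sketch combining several of the options above.
CREATE FOREIGN TABLE sorted_events (
    id INT8,
    ts TIMESTAMP
)
SERVER parquet_s3_srv
OPTIONS (
    dirname 's3://mybucket/events', -- read all Parquet files in this directory
    sorted 'id ts',                 -- files are presorted by id, then ts
    use_threads 'true',             -- enable Arrow's parallel decoding
    max_open_files '10'             -- cap simultaneously open files
);
```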

A foreign table may be created for a single Parquet file or for a set of files. It is also possible to specify a user-defined function that returns a list of file paths (see the sketch after the table below). Depending on the number of files and table options, `parquet_s3_fdw` may use one of the following execution strategies:

| Strategy | Description |
|-------------------------|--------------------------|
| **Single File** | Basic single file reader |
| **Multifile** | Reader that processes Parquet files one by one in a sequential manner |
| **Multifile Merge** | Reader that merges presorted Parquet files so that the produced result is also ordered; used when the `sorted` option is specified and the query plan implies ordering (e.g. it contains an `ORDER BY` clause) |
| **Caching Multifile Merge** | Same as `Multifile Merge`, but keeps the number of simultaneously open files limited; used when the number of specified Parquet files exceeds `max_open_files` |
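
A sketch of a `files_func` and a table wired to it follows; the function name, body, column names, and paths are illustrative assumptions, not taken from this commit:

```sql
-- Hypothetical files_func: takes one JSONB argument and returns a text
-- array of full paths to Parquet files.
CREATE FUNCTION list_parquet_files(args JSONB) RETURNS TEXT[] AS
$$
BEGIN
    RETURN ARRAY[args->>'dir' || '/example1.parquet',
                 args->>'dir' || '/example2.parquet'];
END
$$ LANGUAGE plpgsql;

-- Hypothetical table using files_func/files_func_arg.
CREATE FOREIGN TABLE listed_data (one INT8, three TEXT)
SERVER parquet_s3_srv
OPTIONS (files_func 'list_parquet_files',
         files_func_arg '{"dir": "/path/to/data/simple"}');
```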

GUC variables:
* **parquet_fdw.use_threads** - global switch that allows the user to enable or disable threads (default `true`).
@@ -94,7 +107,7 @@ SELECT * FROM userdata;
```

## Parallel queries
`parquet_s3_fdw` also supports [parallel query execution](https://www.postgresql.org/docs/current/parallel-query.html) (not to confuse with multi-threaded decoding feature of `arrow`). It is disabled by default; to enable it run `ANALYZE` command on the table. The reason behind this is that without statistics postgres may end up choosing a terrible parallel plan for certain queries which would be much worse than a serial one (e.g. grouping by a column with large number of distinct values).
`parquet_s3_fdw` also supports [parallel query execution](https://www.postgresql.org/docs/current/parallel-query.html) (not to be confused with the multi-threaded decoding feature of Apache Arrow). It is disabled by default; to enable it, run the `ANALYZE` command on the table. The reason behind this is that without statistics postgres may end up choosing a terrible parallel plan for certain queries, which would be much worse than a serial one (e.g. grouping by a column with a large number of distinct values).
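
For instance (a sketch, assuming the `userdata` table from the example above):

```sql
-- Gather statistics so the planner can consider a reasonable parallel plan.
ANALYZE userdata;
```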

## Import
`parquet_s3_fdw` also supports the [`IMPORT FOREIGN SCHEMA`](https://www.postgresql.org/docs/current/sql-importforeignschema.html) command to discover Parquet files in the specified directory on the filesystem and create foreign tables according to those files. It can be used as follows:
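
(The concrete example from this commit is collapsed in the diff view; the following is a sketch based on upstream `parquet_fdw` usage, with a hypothetical directory path.)

```sql
-- The remote "schema" here is the directory containing the Parquet files.
IMPORT FOREIGN SCHEMA "/path/to/data/simple"
FROM SERVER parquet_s3_srv
INTO public;
```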
@@ -160,12 +173,25 @@ SELECT import_parquet_s3_explicit(
## Features
- Support `SELECT` over Parquet files on the local file system or Amazon S3.
- Support MinIO access instead of Amazon S3.
- Allow control over whether foreign servers keep connections open after transaction completion. This is controlled by `keep_connections`, which defaults to `on`.
- Provide the function `parquet_s3_fdw_get_connections()` to report open foreign server connections; see the sketch after this list.
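
A minimal sketch of checking open connections (the function name comes from this commit; the exact call form and output shape are assumptions):

```sql
-- List the open foreign server connections kept by parquet_s3_fdw.
SELECT * FROM parquet_s3_fdw_get_connections();
```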

## Limitations
- Modification (INSERT, UPDATE and DELETE) is not supported.
- Transactions are not supported.
- A single foreign table cannot be created using Parquet files on both the file system and Amazon S3.
- The AWS region is hard-coded as "ap-northeast-1". If you want to use another region, you need to modify the source code by changing "AP_NORTHEAST_1" in parquet_s3_fdw_connection.cpp.
- For queries that return a record type, parquet_s3_fdw fills data only for columns that are referred to in the target list or other clauses; the remaining columns are filled with NULL.
Example:
```sql
-- Columns c1 and c3 are referred to in the ORDER BY clause, so they are
-- filled with values; the other columns (c2, c4, c5, c6) are filled with NULL.
SELECT t1 FROM tbl t1 ORDER BY tbl.c3, tbl.c1;
t1
------------------
(101,,00101,,,,)
(102,,00102,,,,)
(2 rows)
```

## Contributing
Opening issues and pull requests on GitHub are welcome.
11 changes: 10 additions & 1 deletion data/README.md
@@ -1,6 +1,6 @@
# Sample Parquet data

`example.parquet` schema:
`simple/example1.parquet` and `simple/example2.parquet` schema:

| column | type |
|--------|-------------|
@@ -10,6 +10,15 @@
| four | TIMESTAMP |
| five | DATE32 |
| six | BOOL |
| seven | DOUBLE |

`complex/example3.parquet` schema:

| column | type |
|--------|--------------------|
| one | MAP<INT32, STRING> |
| two | MAP<DATE32, INT16> |
| three | STRING |

## Generator

Expand Down
Binary file added data/complex/example3.parquet
Binary file not shown.
Binary file removed data/example1.parquet
Binary file not shown.
Binary file removed data/example2.parquet
Binary file not shown.
34 changes: 29 additions & 5 deletions data/generate.py
@@ -7,7 +7,7 @@
import pyarrow.parquet as pq
from datetime import datetime, date

# row group 1
# example1.parquet file
df1 = pd.DataFrame({'one': [1, 2, 3],
'two': [[1, 2, 3], [None, 5, 6], [7, 8, 9]],
'three': ['foo', 'bar', 'baz'],
@@ -21,7 +21,6 @@
'seven': [0.5, None, 1.0]})
table1 = pa.Table.from_pandas(df1)

# row group 2
df2 = pd.DataFrame({'one': [4, 5, 6],
'two': [[10, 11, 12], [13, 14, 15], [16, 17, 18]],
'three': ['uno', 'dos', 'tres'],
@@ -32,10 +31,10 @@
date(2018, 1, 5),
date(2018, 1, 6)],
'six': [False, False, False],
'seven': [0.5, None, 1.0]})
'seven': [1.5, None, 2.0]})
table2 = pa.Table.from_pandas(df2)

with pq.ParquetWriter('example1.parquet', table1.schema) as writer:
with pq.ParquetWriter('simple/example1.parquet', table1.schema) as writer:
    writer.write_table(table1)
    writer.write_table(table2)

@@ -56,5 +55,30 @@
'six': [True, False, True, False, True]})
table3 = pa.Table.from_pandas(df3)

with pq.ParquetWriter('example2.parquet', table3.schema) as writer:
with pq.ParquetWriter('simple/example2.parquet', table3.schema) as writer:
    writer.write_table(table3)

# example3.parquet file
mdt1 = pa.map_(pa.int32(), pa.string())
mdt2 = pa.map_(pa.date32(), pa.int16())
df = pd.DataFrame({
    'one': pd.Series([
        [(1, 'foo'), (2, 'bar'), (3, 'baz')],
        [(4, 'test1'), (5, 'test2')],
    ]),
    'two': pd.Series([
        [(date(2018, 1, 1), 10), (date(2018, 1, 2), 15)],
        [(date(2018, 1, 3), 20), (date(2018, 1, 4), 25)],
    ]),
    'three': pd.Series([1, 2]),
})

schema = pa.schema([
    pa.field('one', mdt1),
    pa.field('two', mdt2),
    pa.field('three', pa.int32())])
table = pa.Table.from_pandas(df, schema)

with pq.ParquetWriter('complex/example3.parquet', table.schema) as writer:
    writer.write_table(table)
Binary file added data/ported_postgres/T0.parquet
Binary file not shown.
Binary file added data/ported_postgres/T1.parquet
Binary file not shown.
Binary file added data/ported_postgres/T2.parquet
Binary file not shown.
Binary file added data/ported_postgres/T3.parquet
Binary file not shown.
Binary file added data/ported_postgres/T4.parquet
Binary file not shown.
Binary file added data/ported_postgres/base_tbl.parquet
Binary file not shown.
Binary file added data/ported_postgres/child_tbl.parquet
Binary file not shown.
Binary file added data/ported_postgres/ft1.parquet
Binary file not shown.
Binary file added data/ported_postgres/ft1_null.parquet
Binary file not shown.
Binary file added data/ported_postgres/loc1.parquet
Binary file not shown.
Binary file added data/ported_postgres/local_tbl.parquet
Binary file not shown.
Binary file added data/ported_postgres/loct3.parquet
Binary file not shown.
Binary file added data/ported_postgres/loct_empty.parquet
Binary file not shown.
Binary file added data/simple/example1.parquet
Binary file not shown.
Binary file added data/simple/example2.parquet
Binary file not shown.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
2 changes: 0 additions & 2 deletions expected/.gitignore

This file was deleted.
