From ee478eded04ef66ea2a2ceea2f71f782555ce9b6 Mon Sep 17 00:00:00 2001 From: Yi Hu Date: Thu, 2 Jan 2025 11:31:07 -0500 Subject: [PATCH 01/43] Make Beam expansion service shadowJar multi-release so it processes multi-release dependencies correctly (#33472) --- sdks/java/extensions/sql/expansion-service/build.gradle | 3 +++ sdks/java/io/expansion-service/build.gradle | 3 +++ .../io/google-cloud-platform/expansion-service/build.gradle | 3 +++ 3 files changed, 9 insertions(+) diff --git a/sdks/java/extensions/sql/expansion-service/build.gradle b/sdks/java/extensions/sql/expansion-service/build.gradle index b8d78e4e1bb9..024041e40b36 100644 --- a/sdks/java/extensions/sql/expansion-service/build.gradle +++ b/sdks/java/extensions/sql/expansion-service/build.gradle @@ -48,5 +48,8 @@ task runExpansionService (type: JavaExec) { } shadowJar { + manifest { + attributes(["Multi-Release": true]) + } outputs.upToDateWhen { false } } \ No newline at end of file diff --git a/sdks/java/io/expansion-service/build.gradle b/sdks/java/io/expansion-service/build.gradle index a27a66b1f3dc..38bee450e752 100644 --- a/sdks/java/io/expansion-service/build.gradle +++ b/sdks/java/io/expansion-service/build.gradle @@ -34,6 +34,9 @@ configurations.runtimeClasspath { } shadowJar { + manifest { + attributes(["Multi-Release": true]) + } mergeServiceFiles() outputs.upToDateWhen { false } } diff --git a/sdks/java/io/google-cloud-platform/expansion-service/build.gradle b/sdks/java/io/google-cloud-platform/expansion-service/build.gradle index 01181721e9a4..b5ce11853f6c 100644 --- a/sdks/java/io/google-cloud-platform/expansion-service/build.gradle +++ b/sdks/java/io/google-cloud-platform/expansion-service/build.gradle @@ -49,5 +49,8 @@ task runExpansionService (type: JavaExec) { } shadowJar { + manifest { + attributes(["Multi-Release": true]) + } outputs.upToDateWhen { false } } \ No newline at end of file From dfb90ed46684915724e2f99c8f085d10b5a3e03e Mon Sep 17 00:00:00 2001 From: Danny McCormick Date: Thu, 2 Jan 2025 12:01:26 -0500 Subject: [PATCH 02/43] Revert "Update Java expansion service to use distroless (#33464)" (#33476) This reverts commit 669076afd93a9b937a8b8b2e8537f1e0e1375bff. --- sdks/java/expansion-service/container/Dockerfile | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/sdks/java/expansion-service/container/Dockerfile b/sdks/java/expansion-service/container/Dockerfile index 26ff5d0317bb..1b83ec68b994 100644 --- a/sdks/java/expansion-service/container/Dockerfile +++ b/sdks/java/expansion-service/container/Dockerfile @@ -16,7 +16,7 @@ # limitations under the License. ############################################################################### -FROM gcr.io/distroless/java:11 +FROM eclipse-temurin:11 LABEL Author "Apache Beam " ARG TARGETOS ARG TARGETARCH @@ -39,8 +39,7 @@ COPY target/launcher/${TARGETOS}_${TARGETARCH}/boot /opt/apache/beam/ # Copy the config file COPY target/expansion_service_config.yml ./ -# Add golang licenses. These commands require having a shell to work with -COPY --from=busybox:1.35.0-uclibc /bin/sh /bin/sh +# Add golang licenses. 
COPY target/go-licenses/* /opt/apache/beam/third_party_licenses/golang/ RUN if [ "${pull_licenses}" = "false" ] ; then \ # Remove above license dir if pull licenses false From 8f2bf2292501a4fe33fd663c503bd19de8318ec0 Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Thu, 2 Jan 2025 09:18:44 -0800 Subject: [PATCH 03/43] Bump github.com/aws/aws-sdk-go-v2/config from 1.28.6 to 1.28.7 in /sdks (#33440) Bumps [github.com/aws/aws-sdk-go-v2/config](https://github.com/aws/aws-sdk-go-v2) from 1.28.6 to 1.28.7. - [Release notes](https://github.com/aws/aws-sdk-go-v2/releases) - [Commits](https://github.com/aws/aws-sdk-go-v2/compare/config/v1.28.6...config/v1.28.7) --- updated-dependencies: - dependency-name: github.com/aws/aws-sdk-go-v2/config dependency-type: direct:production update-type: version-update:semver-patch ... Signed-off-by: dependabot[bot] Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com> --- sdks/go.mod | 20 ++++++++++---------- sdks/go.sum | 40 ++++++++++++++++++++-------------------- 2 files changed, 30 insertions(+), 30 deletions(-) diff --git a/sdks/go.mod b/sdks/go.mod index 31dcd9f53456..f96d4f7fd1bf 100644 --- a/sdks/go.mod +++ b/sdks/go.mod @@ -30,9 +30,9 @@ require ( cloud.google.com/go/pubsub v1.45.3 cloud.google.com/go/spanner v1.73.0 cloud.google.com/go/storage v1.48.0 - github.com/aws/aws-sdk-go-v2 v1.32.6 - github.com/aws/aws-sdk-go-v2/config v1.28.6 - github.com/aws/aws-sdk-go-v2/credentials v1.17.47 + github.com/aws/aws-sdk-go-v2 v1.32.7 + github.com/aws/aws-sdk-go-v2/config v1.28.7 + github.com/aws/aws-sdk-go-v2/credentials v1.17.48 github.com/aws/aws-sdk-go-v2/feature/s3/manager v1.17.43 github.com/aws/aws-sdk-go-v2/service/s3 v1.71.0 github.com/aws/smithy-go v1.22.1 @@ -132,18 +132,18 @@ require ( github.com/apache/thrift v0.17.0 // indirect github.com/aws/aws-sdk-go v1.34.0 // indirect github.com/aws/aws-sdk-go-v2/aws/protocol/eventstream v1.6.7 // indirect - github.com/aws/aws-sdk-go-v2/feature/ec2/imds v1.16.21 // indirect - github.com/aws/aws-sdk-go-v2/internal/configsources v1.3.25 // indirect - github.com/aws/aws-sdk-go-v2/internal/endpoints/v2 v2.6.25 // indirect + github.com/aws/aws-sdk-go-v2/feature/ec2/imds v1.16.22 // indirect + github.com/aws/aws-sdk-go-v2/internal/configsources v1.3.26 // indirect + github.com/aws/aws-sdk-go-v2/internal/endpoints/v2 v2.6.26 // indirect github.com/aws/aws-sdk-go-v2/internal/ini v1.8.1 // indirect github.com/aws/aws-sdk-go-v2/internal/v4a v1.3.25 // indirect github.com/aws/aws-sdk-go-v2/service/internal/accept-encoding v1.12.1 // indirect github.com/aws/aws-sdk-go-v2/service/internal/checksum v1.4.6 // indirect - github.com/aws/aws-sdk-go-v2/service/internal/presigned-url v1.12.6 // indirect + github.com/aws/aws-sdk-go-v2/service/internal/presigned-url v1.12.7 // indirect github.com/aws/aws-sdk-go-v2/service/internal/s3shared v1.18.6 // indirect - github.com/aws/aws-sdk-go-v2/service/sso v1.24.7 // indirect - github.com/aws/aws-sdk-go-v2/service/ssooidc v1.28.6 // indirect - github.com/aws/aws-sdk-go-v2/service/sts v1.33.2 // indirect + github.com/aws/aws-sdk-go-v2/service/sso v1.24.8 // indirect + github.com/aws/aws-sdk-go-v2/service/ssooidc v1.28.7 // indirect + github.com/aws/aws-sdk-go-v2/service/sts v1.33.3 // indirect github.com/cenkalti/backoff/v4 v4.2.1 // indirect github.com/census-instrumentation/opencensus-proto v0.4.1 // indirect github.com/cespare/xxhash/v2 v2.3.0 // indirect diff --git a/sdks/go.sum b/sdks/go.sum index 
fde28c9e6be5..72658b7e74dc 100644 --- a/sdks/go.sum +++ b/sdks/go.sum @@ -689,26 +689,26 @@ github.com/aws/aws-sdk-go v1.30.19/go.mod h1:5zCpMtNQVjRREroY7sYe8lOMRSxkhG6MZve github.com/aws/aws-sdk-go v1.34.0 h1:brux2dRrlwCF5JhTL7MUT3WUwo9zfDHZZp3+g3Mvlmo= github.com/aws/aws-sdk-go v1.34.0/go.mod h1:5zCpMtNQVjRREroY7sYe8lOMRSxkhG6MZveU8YkpAk0= github.com/aws/aws-sdk-go-v2 v1.7.1/go.mod h1:L5LuPC1ZgDr2xQS7AmIec/Jlc7O/Y1u2KxJyNVab250= -github.com/aws/aws-sdk-go-v2 v1.32.6 h1:7BokKRgRPuGmKkFMhEg/jSul+tB9VvXhcViILtfG8b4= -github.com/aws/aws-sdk-go-v2 v1.32.6/go.mod h1:P5WJBrYqqbWVaOxgH0X/FYYD47/nooaPOZPlQdmiN2U= +github.com/aws/aws-sdk-go-v2 v1.32.7 h1:ky5o35oENWi0JYWUZkB7WYvVPP+bcRF5/Iq7JWSb5Rw= +github.com/aws/aws-sdk-go-v2 v1.32.7/go.mod h1:P5WJBrYqqbWVaOxgH0X/FYYD47/nooaPOZPlQdmiN2U= github.com/aws/aws-sdk-go-v2/aws/protocol/eventstream v1.6.7 h1:lL7IfaFzngfx0ZwUGOZdsFFnQ5uLvR0hWqqhyE7Q9M8= github.com/aws/aws-sdk-go-v2/aws/protocol/eventstream v1.6.7/go.mod h1:QraP0UcVlQJsmHfioCrveWOC1nbiWUl3ej08h4mXWoc= github.com/aws/aws-sdk-go-v2/config v1.5.0/go.mod h1:RWlPOAW3E3tbtNAqTwvSW54Of/yP3oiZXMI0xfUdjyA= -github.com/aws/aws-sdk-go-v2/config v1.28.6 h1:D89IKtGrs/I3QXOLNTH93NJYtDhm8SYa9Q5CsPShmyo= -github.com/aws/aws-sdk-go-v2/config v1.28.6/go.mod h1:GDzxJ5wyyFSCoLkS+UhGB0dArhb9mI+Co4dHtoTxbko= +github.com/aws/aws-sdk-go-v2/config v1.28.7 h1:GduUnoTXlhkgnxTD93g1nv4tVPILbdNQOzav+Wpg7AE= +github.com/aws/aws-sdk-go-v2/config v1.28.7/go.mod h1:vZGX6GVkIE8uECSUHB6MWAUsd4ZcG2Yq/dMa4refR3M= github.com/aws/aws-sdk-go-v2/credentials v1.3.1/go.mod h1:r0n73xwsIVagq8RsxmZbGSRQFj9As3je72C2WzUIToc= -github.com/aws/aws-sdk-go-v2/credentials v1.17.47 h1:48bA+3/fCdi2yAwVt+3COvmatZ6jUDNkDTIsqDiMUdw= -github.com/aws/aws-sdk-go-v2/credentials v1.17.47/go.mod h1:+KdckOejLW3Ks3b0E3b5rHsr2f9yuORBum0WPnE5o5w= +github.com/aws/aws-sdk-go-v2/credentials v1.17.48 h1:IYdLD1qTJ0zanRavulofmqut4afs45mOWEI+MzZtTfQ= +github.com/aws/aws-sdk-go-v2/credentials v1.17.48/go.mod h1:tOscxHN3CGmuX9idQ3+qbkzrjVIx32lqDSU1/0d/qXs= github.com/aws/aws-sdk-go-v2/feature/ec2/imds v1.3.0/go.mod h1:2LAuqPx1I6jNfaGDucWfA2zqQCYCOMCDHiCOciALyNw= -github.com/aws/aws-sdk-go-v2/feature/ec2/imds v1.16.21 h1:AmoU1pziydclFT/xRV+xXE/Vb8fttJCLRPv8oAkprc0= -github.com/aws/aws-sdk-go-v2/feature/ec2/imds v1.16.21/go.mod h1:AjUdLYe4Tgs6kpH4Bv7uMZo7pottoyHMn4eTcIcneaY= +github.com/aws/aws-sdk-go-v2/feature/ec2/imds v1.16.22 h1:kqOrpojG71DxJm/KDPO+Z/y1phm1JlC8/iT+5XRmAn8= +github.com/aws/aws-sdk-go-v2/feature/ec2/imds v1.16.22/go.mod h1:NtSFajXVVL8TA2QNngagVZmUtXciyrHOt7xgz4faS/M= github.com/aws/aws-sdk-go-v2/feature/s3/manager v1.3.2/go.mod h1:qaqQiHSrOUVOfKe6fhgQ6UzhxjwqVW8aHNegd6Ws4w4= github.com/aws/aws-sdk-go-v2/feature/s3/manager v1.17.43 h1:iLdpkYZ4cXIQMO7ud+cqMWR1xK5ESbt1rvN77tRi1BY= github.com/aws/aws-sdk-go-v2/feature/s3/manager v1.17.43/go.mod h1:OgbsKPAswXDd5kxnR4vZov69p3oYjbvUyIRBAAV0y9o= -github.com/aws/aws-sdk-go-v2/internal/configsources v1.3.25 h1:s/fF4+yDQDoElYhfIVvSNyeCydfbuTKzhxSXDXCPasU= -github.com/aws/aws-sdk-go-v2/internal/configsources v1.3.25/go.mod h1:IgPfDv5jqFIzQSNbUEMoitNooSMXjRSDkhXv8jiROvU= -github.com/aws/aws-sdk-go-v2/internal/endpoints/v2 v2.6.25 h1:ZntTCl5EsYnhN/IygQEUugpdwbhdkom9uHcbCftiGgA= -github.com/aws/aws-sdk-go-v2/internal/endpoints/v2 v2.6.25/go.mod h1:DBdPrgeocww+CSl1C8cEV8PN1mHMBhuCDLpXezyvWkE= +github.com/aws/aws-sdk-go-v2/internal/configsources v1.3.26 h1:I/5wmGMffY4happ8NOCuIUEWGUvvFp5NSeQcXl9RHcI= +github.com/aws/aws-sdk-go-v2/internal/configsources v1.3.26/go.mod h1:FR8f4turZtNy6baO0KJ5FJUmXH/cSkI9fOngs0yl6mA= 
+github.com/aws/aws-sdk-go-v2/internal/endpoints/v2 v2.6.26 h1:zXFLuEuMMUOvEARXFUVJdfqZ4bvvSgdGRq/ATcrQxzM= +github.com/aws/aws-sdk-go-v2/internal/endpoints/v2 v2.6.26/go.mod h1:3o2Wpy0bogG1kyOPrgkXA8pgIfEEv0+m19O9D5+W8y8= github.com/aws/aws-sdk-go-v2/internal/ini v1.1.1/go.mod h1:Zy8smImhTdOETZqfyn01iNOe0CNggVbPjCajyaz6Gvg= github.com/aws/aws-sdk-go-v2/internal/ini v1.8.1 h1:VaRN3TlFdd6KxX1x3ILT5ynH6HvKgqdiXoTxAF4HQcQ= github.com/aws/aws-sdk-go-v2/internal/ini v1.8.1/go.mod h1:FbtygfRFze9usAadmnGJNc8KsP346kEe+y2/oyhGAGc= @@ -720,8 +720,8 @@ github.com/aws/aws-sdk-go-v2/service/internal/accept-encoding v1.12.1/go.mod h1: github.com/aws/aws-sdk-go-v2/service/internal/checksum v1.4.6 h1:HCpPsWqmYQieU7SS6E9HXfdAMSud0pteVXieJmcpIRI= github.com/aws/aws-sdk-go-v2/service/internal/checksum v1.4.6/go.mod h1:ngUiVRCco++u+soRRVBIvBZxSMMvOVMXA4PJ36JLfSw= github.com/aws/aws-sdk-go-v2/service/internal/presigned-url v1.2.1/go.mod h1:zceowr5Z1Nh2WVP8bf/3ikB41IZW59E4yIYbg+pC6mw= -github.com/aws/aws-sdk-go-v2/service/internal/presigned-url v1.12.6 h1:50+XsN70RS7dwJ2CkVNXzj7U2L1HKP8nqTd3XWEXBN4= -github.com/aws/aws-sdk-go-v2/service/internal/presigned-url v1.12.6/go.mod h1:WqgLmwY7so32kG01zD8CPTJWVWM+TzJoOVHwTg4aPug= +github.com/aws/aws-sdk-go-v2/service/internal/presigned-url v1.12.7 h1:8eUsivBQzZHqe/3FE+cqwfH+0p5Jo8PFM/QYQSmeZ+M= +github.com/aws/aws-sdk-go-v2/service/internal/presigned-url v1.12.7/go.mod h1:kLPQvGUmxn/fqiCrDeohwG33bq2pQpGeY62yRO6Nrh0= github.com/aws/aws-sdk-go-v2/service/internal/s3shared v1.5.1/go.mod h1:6EQZIwNNvHpq/2/QSJnp4+ECvqIy55w95Ofs0ze+nGQ= github.com/aws/aws-sdk-go-v2/service/internal/s3shared v1.18.6 h1:BbGDtTi0T1DYlmjBiCr/le3wzhA37O8QTC5/Ab8+EXk= github.com/aws/aws-sdk-go-v2/service/internal/s3shared v1.18.6/go.mod h1:hLMJt7Q8ePgViKupeymbqI0la+t9/iYFBjxQCFwuAwI= @@ -729,13 +729,13 @@ github.com/aws/aws-sdk-go-v2/service/s3 v1.11.1/go.mod h1:XLAGFrEjbvMCLvAtWLLP32 github.com/aws/aws-sdk-go-v2/service/s3 v1.71.0 h1:nyuzXooUNJexRT0Oy0UQY6AhOzxPxhtt4DcBIHyCnmw= github.com/aws/aws-sdk-go-v2/service/s3 v1.71.0/go.mod h1:sT/iQz8JK3u/5gZkT+Hmr7GzVZehUMkRZpOaAwYXeGY= github.com/aws/aws-sdk-go-v2/service/sso v1.3.1/go.mod h1:J3A3RGUvuCZjvSuZEcOpHDnzZP/sKbhDWV2T1EOzFIM= -github.com/aws/aws-sdk-go-v2/service/sso v1.24.7 h1:rLnYAfXQ3YAccocshIH5mzNNwZBkBo+bP6EhIxak6Hw= -github.com/aws/aws-sdk-go-v2/service/sso v1.24.7/go.mod h1:ZHtuQJ6t9A/+YDuxOLnbryAmITtr8UysSny3qcyvJTc= -github.com/aws/aws-sdk-go-v2/service/ssooidc v1.28.6 h1:JnhTZR3PiYDNKlXy50/pNeix9aGMo6lLpXwJ1mw8MD4= -github.com/aws/aws-sdk-go-v2/service/ssooidc v1.28.6/go.mod h1:URronUEGfXZN1VpdktPSD1EkAL9mfrV+2F4sjH38qOY= +github.com/aws/aws-sdk-go-v2/service/sso v1.24.8 h1:CvuUmnXI7ebaUAhbJcDy9YQx8wHR69eZ9I7q5hszt/g= +github.com/aws/aws-sdk-go-v2/service/sso v1.24.8/go.mod h1:XDeGv1opzwm8ubxddF0cgqkZWsyOtw4lr6dxwmb6YQg= +github.com/aws/aws-sdk-go-v2/service/ssooidc v1.28.7 h1:F2rBfNAL5UyswqoeWv9zs74N/NanhK16ydHW1pahX6E= +github.com/aws/aws-sdk-go-v2/service/ssooidc v1.28.7/go.mod h1:JfyQ0g2JG8+Krq0EuZNnRwX0mU0HrwY/tG6JNfcqh4k= github.com/aws/aws-sdk-go-v2/service/sts v1.6.0/go.mod h1:q7o0j7d7HrJk/vr9uUt3BVRASvcU7gYZB9PUgPiByXg= -github.com/aws/aws-sdk-go-v2/service/sts v1.33.2 h1:s4074ZO1Hk8qv65GqNXqDjmkf4HSQqJukaLuuW0TpDA= -github.com/aws/aws-sdk-go-v2/service/sts v1.33.2/go.mod h1:mVggCnIWoM09jP71Wh+ea7+5gAp53q+49wDFs1SW5z8= +github.com/aws/aws-sdk-go-v2/service/sts v1.33.3 h1:Xgv/hyNgvLda/M9l9qxXc4UFSgppnRczLxlMs5Ae/QY= +github.com/aws/aws-sdk-go-v2/service/sts v1.33.3/go.mod h1:5Gn+d+VaaRgsjewpMvGazt0WfcFO+Md4wLOuBfGR9Bc= 
github.com/aws/smithy-go v1.6.0/go.mod h1:SObp3lf9smib00L/v3U2eAKG8FyQ7iLrJnQiAmR5n+E= github.com/aws/smithy-go v1.22.1 h1:/HPHZQ0g7f4eUeK6HKglFz8uwVfZKgoI25rb/J+dnro= github.com/aws/smithy-go v1.22.1/go.mod h1:irrKGvNn1InZwb2d7fkIRNucdfwR8R+Ts3wxYa/cJHg= From 0a91a3937a627f0ef48a1292df9767c3fd1e0e3d Mon Sep 17 00:00:00 2001 From: Shingo Furuyama Date: Thu, 2 Jan 2025 18:59:16 +0000 Subject: [PATCH 04/43] fix error on local-env-setup.sh (#33461) * fix error on local-env-setup.sh * Update local-env-setup.sh --- .gitignore | 2 ++ local-env-setup.sh | 2 +- 2 files changed, 3 insertions(+), 1 deletion(-) diff --git a/.gitignore b/.gitignore index 19778831888d..2bad81975ba0 100644 --- a/.gitignore +++ b/.gitignore @@ -16,6 +16,8 @@ sdks/**/vendor/**/* runners/**/vendor/**/* **/.gradletasknamecache **/generated/* +/go.mod +/go.sum # Ignore sources generated into the main tree **/src/main/generated/** diff --git a/local-env-setup.sh b/local-env-setup.sh index ba30813b2bcc..b75cf14f22c4 100755 --- a/local-env-setup.sh +++ b/local-env-setup.sh @@ -24,7 +24,7 @@ darwin_install_pip3_packages() { install_go_packages(){ echo "Installing goavro" - go get github.com/linkedin/goavro/v2 + go mod init beam-runtime && go get github.com/linkedin/goavro/v2 # As we are using bash, we are assuming .bashrc exists. grep -qxF "export GOPATH=${PWD}/sdks/go/examples/.gogradle/project_gopath" ~/.bashrc gopathExists=$? From f32724be8eda571fa0ced2dcce35d8737204a22e Mon Sep 17 00:00:00 2001 From: Damon Date: Thu, 2 Jan 2025 11:18:41 -0800 Subject: [PATCH 05/43] Stage beam_Publish_Python_SDK_Distroless_Snapshots.yml (#33470) * Create beam_Publish_Java_SDK_Distroless_Snapshots.yml * Rename beam_Publish_Java_SDK_Distroless_Snapshots.yml to beam_Publish_Python_SDK_Distroless_Snapshots.yml --- ...ublish_Python_SDK_Distroless_Snapshots.yml | 92 +++++++++++++++++++ 1 file changed, 92 insertions(+) create mode 100644 .github/workflows/beam_Publish_Python_SDK_Distroless_Snapshots.yml diff --git a/.github/workflows/beam_Publish_Python_SDK_Distroless_Snapshots.yml b/.github/workflows/beam_Publish_Python_SDK_Distroless_Snapshots.yml new file mode 100644 index 000000000000..9ae37712044a --- /dev/null +++ b/.github/workflows/beam_Publish_Python_SDK_Distroless_Snapshots.yml @@ -0,0 +1,92 @@ +# Licensed to the Apache Software Foundation (ASF) under one or more +# contributor license agreements. See the NOTICE file distributed with +# this work for additional information regarding copyright ownership. +# The ASF licenses this file to You under the Apache License, Version 2.0 +# (the "License"); you may not use this file except in compliance with +# the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +name: Publish Beam Python SDK Distroless Snapshots + +on: + schedule: + - cron: '45 */8 * * *' + workflow_dispatch: + + #Setting explicit permissions for the action to avoid the default permissions which are `write-all` in case of pull_request_target event +permissions: + actions: write + pull-requests: read + checks: read + contents: read + deployments: read + id-token: none + issues: read + discussions: read + packages: read + pages: read + repository-projects: read + security-events: read + statuses: read + +# This allows a subsequently queued workflow run to interrupt previous runs +concurrency: + group: '${{ github.workflow }} @ ${{ github.event.issue.number || github.sha || github.head_ref || github.ref }}-${{ github.event.schedule || github.event.sender.login }}' + cancel-in-progress: true + +env: + DEVELOCITY_ACCESS_KEY: ${{ secrets.GE_ACCESS_TOKEN }} + GRADLE_ENTERPRISE_CACHE_USERNAME: ${{ secrets.GE_CACHE_USERNAME }} + GRADLE_ENTERPRISE_CACHE_PASSWORD: ${{ secrets.GE_CACHE_PASSWORD }} + docker_registry: gcr.io + +jobs: + Python_SDK_Distroless_Snapshots: + if: | + github.event_name == 'workflow_dispatch' || + (github.event_name == 'schedule' && github.repository == 'apache/beam') + runs-on: [self-hosted, ubuntu-20.04, main] + timeout-minutes: 160 + name: ${{ matrix.job_name }} (${{ matrix.python_version }}) + strategy: + fail-fast: false + matrix: + job_name: ["Python_SDK_Distroless_Snapshots"] + job_phrase: ["N/A"] + python_version: + - "python3.9" + - "python3.10" + - "python3.11" + - "python3.12" + steps: + - uses: actions/checkout@v4 + - name: Setup repository + uses: ./.github/actions/setup-action + with: + comment_phrase: ${{ matrix.job_phrase }} + github_token: ${{ secrets.GITHUB_TOKEN }} + github_job: ${{ matrix.job_name }} (${{ matrix.python_version }}) + - name: Find Beam Version + # We extract the Beam version here and tag the containers with it. Version will be in the form "2.xx.y.dev". + # This is needed to run pipelines that use the default environment at HEAD, for example, when a + # pipeline uses an expansion service built from HEAD. 
+ run: | + BEAM_VERSION_LINE=$(cat gradle.properties | grep "sdk_version") + echo "BEAM_VERSION=${BEAM_VERSION_LINE#*sdk_version=}" >> $GITHUB_ENV + - name: Set up Docker Buildx + uses: docker/setup-buildx-action@v1 + - name: GCloud Docker credential helper + run: | + gcloud auth configure-docker ${{ env.docker_registry }} + # TODO(https://github.com/apache/beam/issues/32914): create after merging into main branch + # - name: Build and push Python distroless image + + + From 73575e47a843dcfb8826cbc9a5d2acaf5e51c470 Mon Sep 17 00:00:00 2001 From: Damon Date: Thu, 2 Jan 2025 11:18:50 -0800 Subject: [PATCH 06/43] Stage beam_Publish_Java_SDK_Distroless_Snapshots.yml (#33469) * Create beam_Publish_Java_SDK_Distroless_Snapshots.yml * Update beam_Publish_Java_SDK_Distroless_Snapshots.yml * Remove extra space * Update name of workflow * Update beam_Publish_Java_SDK_Distroless_Snapshots.yml * Delay schedule past non-distroless --- ..._Publish_Java_SDK_Distroless_Snapshots.yml | 87 +++++++++++++++++++ 1 file changed, 87 insertions(+) create mode 100644 .github/workflows/beam_Publish_Java_SDK_Distroless_Snapshots.yml diff --git a/.github/workflows/beam_Publish_Java_SDK_Distroless_Snapshots.yml b/.github/workflows/beam_Publish_Java_SDK_Distroless_Snapshots.yml new file mode 100644 index 000000000000..944648fc919b --- /dev/null +++ b/.github/workflows/beam_Publish_Java_SDK_Distroless_Snapshots.yml @@ -0,0 +1,87 @@ +# Licensed to the Apache Software Foundation (ASF) under one or more +# contributor license agreements. See the NOTICE file distributed with +# this work for additional information regarding copyright ownership. +# The ASF licenses this file to You under the Apache License, Version 2.0 +# (the "License"); you may not use this file except in compliance with +# the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +name: Publish Beam Java SDK Distroless Snapshots + +on: + schedule: + - cron: '45 */8 * * *' + workflow_dispatch: + +#Setting explicit permissions for the action to avoid the default permissions which are `write-all` in case of pull_request_target event +permissions: + actions: write + pull-requests: read + checks: read + contents: read + deployments: read + id-token: none + issues: read + discussions: read + packages: read + pages: read + repository-projects: read + security-events: read + statuses: read + +# This allows a subsequently queued workflow run to interrupt previous runs +concurrency: + group: '${{ github.workflow }} @ ${{ github.event.issue.number || github.sha || github.head_ref || github.ref }}-${{ github.event.schedule || github.event.sender.login }}' + cancel-in-progress: true + +env: + DEVELOCITY_ACCESS_KEY: ${{ secrets.GE_ACCESS_TOKEN }} + GRADLE_ENTERPRISE_CACHE_USERNAME: ${{ secrets.GE_CACHE_USERNAME }} + GRADLE_ENTERPRISE_CACHE_PASSWORD: ${{ secrets.GE_CACHE_PASSWORD }} + docker_registry: gcr.io + +jobs: + Java_SDK_Distroless_Snapshots: + if: | + github.event_name == 'workflow_dispatch' || + (github.event_name == 'schedule' && github.repository == 'apache/beam') + runs-on: [self-hosted, ubuntu-20.04, main] + timeout-minutes: 160 + name: ${{ matrix.job_name }} (${{ matrix.java_version }}) + strategy: + fail-fast: false + matrix: + job_name: ["Java_SDK_Distroless_Snapshots"] + job_phrase: ["N/A"] + java_version: + - "java17" + - "java21" + steps: + - uses: actions/checkout@v4 + - name: Setup repository + uses: ./.github/actions/setup-action + with: + comment_phrase: ${{ matrix.job_phrase }} + github_token: ${{ secrets.GITHUB_TOKEN }} + github_job: ${{ matrix.job_name }} (${{ matrix.java_version }}) + - name: Find Beam Version + # We extract the Beam version here and tag the containers with it. Version will be in the form "2.xx.y.dev". + # This is needed to run pipelines that use the default environment at HEAD, for example, when a + # pipeline uses an expansion service built from HEAD. + run: | + BEAM_VERSION_LINE=$(cat gradle.properties | grep "sdk_version") + echo "BEAM_VERSION=${BEAM_VERSION_LINE#*sdk_version=}" >> $GITHUB_ENV + - name: Set up Docker Buildx + uses: docker/setup-buildx-action@v1 + - name: GCloud Docker credential helper + run: | + gcloud auth configure-docker ${{ env.docker_registry }} + # TODO(https://github.com/apache/beam/issues/33201): create after merging into main branch + # - name: Build and push Java distroless image From badbce07b10854ff47c49a9a64265e2299af3032 Mon Sep 17 00:00:00 2001 From: claudevdm <33973061+claudevdm@users.noreply.github.com> Date: Thu, 2 Jan 2025 14:23:29 -0500 Subject: [PATCH 07/43] Update BEAM_DEV_SDK_CONTAINER_TAG for deps update. (#33480) Co-authored-by: Claude --- sdks/python/apache_beam/runners/dataflow/internal/names.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sdks/python/apache_beam/runners/dataflow/internal/names.py b/sdks/python/apache_beam/runners/dataflow/internal/names.py index ac575e82717e..65bedf39fb2c 100644 --- a/sdks/python/apache_beam/runners/dataflow/internal/names.py +++ b/sdks/python/apache_beam/runners/dataflow/internal/names.py @@ -34,6 +34,6 @@ # Unreleased sdks use container image tag specified below. # Update this tag whenever there is a change that # requires changes to SDK harness container or SDK harness launcher. 
-BEAM_DEV_SDK_CONTAINER_TAG = 'beam-master-20241118' +BEAM_DEV_SDK_CONTAINER_TAG = 'beam-master-20250102' DATAFLOW_CONTAINER_IMAGE_REPOSITORY = 'gcr.io/cloud-dataflow/v1beta3' From f3972c7c6d12e8cdaed1d7d2b489565e00860fff Mon Sep 17 00:00:00 2001 From: Danny McCormick Date: Thu, 2 Jan 2025 15:34:17 -0500 Subject: [PATCH 08/43] Add descriptive tags to republished containers (#33477) * Add descriptive tags to republished containers * Simplify defaults --- .../republish_released_docker_containers.yml | 18 +++++++++++++----- 1 file changed, 13 insertions(+), 5 deletions(-) diff --git a/.github/workflows/republish_released_docker_containers.yml b/.github/workflows/republish_released_docker_containers.yml index ed6e74ecf13d..3c1ff02417cb 100644 --- a/.github/workflows/republish_released_docker_containers.yml +++ b/.github/workflows/republish_released_docker_containers.yml @@ -25,17 +25,17 @@ on: RELEASE: description: Beam version of current release (e.g. 2.XX.0) required: true - default: '' + default: '2.61.0' RC: description: Integer RC version for the release (e.g. 3 for RC3) required: true - default: '' + default: '3' schedule: - cron: "0 6 * * 1" env: docker_registry: gcr.io - release: ${{ github.event.inputs.RELEASE || "2.61.0" }} - rc: ${{ github.event.inputs.RC || "3" }} + release: "${{ github.event.inputs.RELEASE }}" + rc: "${{ github.event.inputs.RC}}" jobs: @@ -69,5 +69,13 @@ jobs: run: | gcloud auth configure-docker ${{ env.docker_registry }} - name: Push docker images - run: ./gradlew :pushAllDockerImages -PisRelease -Pdocker-pull-licenses -Pprune-images -Pdocker-repository-root=gcr.io/apache-beam-testing/updated_released_container_images -Pdocker-tag=${{ env.release }}rc${{ env.rc }} --no-daemon --no-parallel + run: | + ./gradlew :pushAllDockerImages \ + -PisRelease \ + -Pdocker-pull-licenses \ + -Pprune-images \ + -Pdocker-repository-root=gcr.io/apache-beam-testing/updated_released_container_images \ + -Pdocker-tag-list=${{ env.release }},${{ github.sha }},$(date +'%Y-%m-%d') \ + --no-daemon \ + --no-parallel From 2fb01be5b6e6ae149395ea69091b30b491a3b61d Mon Sep 17 00:00:00 2001 From: Rohit Sinha Date: Thu, 2 Jan 2025 12:13:25 -0800 Subject: [PATCH 09/43] Fix bounded trie merge when merging on empty trie --- sdks/python/apache_beam/metrics/cells.py | 2 +- sdks/python/apache_beam/metrics/cells_test.py | 17 +++++++++++++++++ 2 files changed, 18 insertions(+), 1 deletion(-) diff --git a/sdks/python/apache_beam/metrics/cells.py b/sdks/python/apache_beam/metrics/cells.py index 9a62cae14691..4da1bb8c2023 100644 --- a/sdks/python/apache_beam/metrics/cells.py +++ b/sdks/python/apache_beam/metrics/cells.py @@ -740,7 +740,7 @@ def merge(self, other: '_BoundedTrieNode') -> int: delta = 0 elif not self._children: self._children = other._children - delta = self._size - other._size + delta = other._size - self._size else: delta = 0 other_child: '_BoundedTrieNode' diff --git a/sdks/python/apache_beam/metrics/cells_test.py b/sdks/python/apache_beam/metrics/cells_test.py index 68e3f12a73fd..1cd15fced86c 100644 --- a/sdks/python/apache_beam/metrics/cells_test.py +++ b/sdks/python/apache_beam/metrics/cells_test.py @@ -420,6 +420,23 @@ def test_bounded_trie_data_combine_trim(self): BoundedTrieData(root=right, bound=3)).get_result(), set([('a', True), ('b', 'd', False), ('c', 'd', False)])) + def test_merge_on_empty_node(self): + root1 = _BoundedTrieNode() + root2 = _BoundedTrieNode() + root2.add_all([["a", "b", "c"], ["a", "b", "d"], ["a", "e"]]) + self.assertEqual(2, root1.merge(root2)) + 
self.assertEqual(3, root1.size()) + self.assertFalse(root1._truncated) + + def test_merge_with_empty_node(self): + root1 = _BoundedTrieNode() + root1.add_all([["a", "b", "c"], ["a", "b", "d"], ["a", "e"]]) + root2 = _BoundedTrieNode() + + self.assertEqual(0, root1.merge(root2)) + self.assertEqual(3, root1.size()) + self.assertFalse(root1._truncated) + if __name__ == '__main__': unittest.main() From 4c14921b344f7b1fabdf73ffbbb316527dab531d Mon Sep 17 00:00:00 2001 From: Robert Bradshaw Date: Thu, 2 Jan 2025 16:26:48 -0800 Subject: [PATCH 10/43] Better error message on incorrect package listing. --- sdks/python/apache_beam/yaml/yaml_provider.py | 25 ++++++++++++------- 1 file changed, 16 insertions(+), 9 deletions(-) diff --git a/sdks/python/apache_beam/yaml/yaml_provider.py b/sdks/python/apache_beam/yaml/yaml_provider.py index 8dfa314aeb62..b3518c568653 100755 --- a/sdks/python/apache_beam/yaml/yaml_provider.py +++ b/sdks/python/apache_beam/yaml/yaml_provider.py @@ -36,6 +36,7 @@ from typing import Dict from typing import Iterable from typing import Iterator +from typing import List from typing import Mapping from typing import Optional @@ -349,7 +350,7 @@ def python(urns, packages=()): @ExternalProvider.register_provider_type('pythonPackage') class ExternalPythonProvider(ExternalProvider): - def __init__(self, urns, packages): + def __init__(self, urns, packages: Iterable[str]): super().__init__(urns, PypiExpansionService(packages)) def available(self): @@ -1017,26 +1018,31 @@ class PypiExpansionService: """ VENV_CACHE = os.path.expanduser("~/.apache_beam/cache/venvs") - def __init__(self, packages, base_python=sys.executable): - self._packages = packages + def __init__( + self, packages: Iterable[str], base_python: str = sys.executable): + if not isinstance(packages, Iterable) or isinstance(packages, str): + raise TypeError( + "Packages must be an iterable of strings, got %r" % packages) + self._packages = list(packages) self._base_python = base_python @classmethod - def _key(cls, base_python, packages): + def _key(cls, base_python: str, packages: List[str]) -> str: return json.dumps({ 'binary': base_python, 'packages': sorted(packages) }, sort_keys=True) @classmethod - def _path(cls, base_python, packages): + def _path(cls, base_python: str, packages: List[str]) -> str: return os.path.join( cls.VENV_CACHE, hashlib.sha256(cls._key(base_python, packages).encode('utf-8')).hexdigest()) @classmethod - def _create_venv_from_scratch(cls, base_python, packages): + def _create_venv_from_scratch( + cls, base_python: str, packages: List[str]) -> str: venv = cls._path(base_python, packages) if not os.path.exists(venv): try: @@ -1054,7 +1060,8 @@ def _create_venv_from_scratch(cls, base_python, packages): return venv @classmethod - def _create_venv_from_clone(cls, base_python, packages): + def _create_venv_from_clone( + cls, base_python: str, packages: List[str]) -> str: venv = cls._path(base_python, packages) if not os.path.exists(venv): try: @@ -1074,7 +1081,7 @@ def _create_venv_from_clone(cls, base_python, packages): return venv @classmethod - def _create_venv_to_clone(cls, base_python): + def _create_venv_to_clone(cls, base_python: str) -> str: if '.dev' in beam_version: base_venv = os.path.dirname(os.path.dirname(base_python)) print('Cloning dev environment from', base_venv) @@ -1085,7 +1092,7 @@ def _create_venv_to_clone(cls, base_python): 'virtualenv-clone' ]) - def _venv(self): + def _venv(self) -> str: return self._create_venv_from_clone(self._base_python, self._packages) def 
__enter__(self): From 7e6cf18fef72f8e460adc7cb9331ec5975fd29ef Mon Sep 17 00:00:00 2001 From: Danny McCormick Date: Fri, 3 Jan 2025 09:42:18 -0500 Subject: [PATCH 11/43] Update gcp bom to 26.50.0 (#33192) * Update bom to 26.50.0 * Remove null checks * Update gcs * Bump aws versions --- .../apache/beam/gradle/BeamModulePlugin.groovy | 16 ++++++++-------- .../container/license_scripts/dep_urls_java.yaml | 2 +- sdks/java/io/amazon-web-services2/build.gradle | 2 +- .../beam/sdk/io/gcp/firestore/FirestoreV1.java | 2 +- .../beam/sdk/io/gcp/pubsub/PubsubGrpcClient.java | 5 ++--- .../beam/sdk/io/gcp/pubsub/PubsubJsonClient.java | 7 +------ 6 files changed, 14 insertions(+), 20 deletions(-) diff --git a/buildSrc/src/main/groovy/org/apache/beam/gradle/BeamModulePlugin.groovy b/buildSrc/src/main/groovy/org/apache/beam/gradle/BeamModulePlugin.groovy index 51157e2e3166..7b791ef9aa8e 100644 --- a/buildSrc/src/main/groovy/org/apache/beam/gradle/BeamModulePlugin.groovy +++ b/buildSrc/src/main/groovy/org/apache/beam/gradle/BeamModulePlugin.groovy @@ -594,7 +594,7 @@ class BeamModulePlugin implements Plugin { def activemq_version = "5.14.5" def autovalue_version = "1.9" def autoservice_version = "1.0.1" - def aws_java_sdk2_version = "2.20.47" + def aws_java_sdk2_version = "2.20.162" def cassandra_driver_version = "3.10.2" def cdap_version = "6.5.1" def checkerframework_version = "3.42.0" @@ -602,12 +602,12 @@ class BeamModulePlugin implements Plugin { def dbcp2_version = "2.9.0" def errorprone_version = "2.10.0" // [bomupgrader] determined by: com.google.api:gax, consistent with: google_cloud_platform_libraries_bom - def gax_version = "2.55.0" + def gax_version = "2.57.0" def google_ads_version = "33.0.0" def google_clients_version = "2.0.0" def google_cloud_bigdataoss_version = "2.2.26" // [bomupgrader] determined by: com.google.cloud:google-cloud-spanner, consistent with: google_cloud_platform_libraries_bom - def google_cloud_spanner_version = "6.79.0" + def google_cloud_spanner_version = "6.80.1" def google_code_gson_version = "2.10.1" def google_oauth_clients_version = "1.34.1" // [bomupgrader] determined by: io.grpc:grpc-netty, consistent with: google_cloud_platform_libraries_bom @@ -630,7 +630,7 @@ class BeamModulePlugin implements Plugin { def postgres_version = "42.2.16" def powermock_version = "2.0.9" // [bomupgrader] determined by: com.google.protobuf:protobuf-java, consistent with: google_cloud_platform_libraries_bom - def protobuf_version = "3.25.5" + def protobuf_version = "4.28.3" def qpid_jms_client_version = "0.61.0" def quickcheck_version = "1.0" def sbe_tool_version = "1.25.1" @@ -728,12 +728,12 @@ class BeamModulePlugin implements Plugin { google_api_client_gson : "com.google.api-client:google-api-client-gson:$google_clients_version", google_api_client_java6 : "com.google.api-client:google-api-client-java6:$google_clients_version", google_api_common : "com.google.api:api-common", // google_cloud_platform_libraries_bom sets version - google_api_services_bigquery : "com.google.apis:google-api-services-bigquery:v2-rev20240919-2.0.0", // [bomupgrader] sets version + google_api_services_bigquery : "com.google.apis:google-api-services-bigquery:v2-rev20241013-2.0.0", // [bomupgrader] sets version google_api_services_cloudresourcemanager : "com.google.apis:google-api-services-cloudresourcemanager:v1-rev20240310-2.0.0", // [bomupgrader] sets version google_api_services_dataflow : "com.google.apis:google-api-services-dataflow:v1b3-rev20240817-$google_clients_version", google_api_services_healthcare 
: "com.google.apis:google-api-services-healthcare:v1-rev20240130-$google_clients_version", google_api_services_pubsub : "com.google.apis:google-api-services-pubsub:v1-rev20220904-$google_clients_version", - google_api_services_storage : "com.google.apis:google-api-services-storage:v1-rev20240924-2.0.0", // [bomupgrader] sets version + google_api_services_storage : "com.google.apis:google-api-services-storage:v1-rev20241008-2.0.0", // [bomupgrader] sets version google_auth_library_credentials : "com.google.auth:google-auth-library-credentials", // google_cloud_platform_libraries_bom sets version google_auth_library_oauth2_http : "com.google.auth:google-auth-library-oauth2-http", // google_cloud_platform_libraries_bom sets version google_cloud_bigquery : "com.google.cloud:google-cloud-bigquery", // google_cloud_platform_libraries_bom sets version @@ -745,13 +745,13 @@ class BeamModulePlugin implements Plugin { google_cloud_core_grpc : "com.google.cloud:google-cloud-core-grpc", // google_cloud_platform_libraries_bom sets version google_cloud_datacatalog_v1beta1 : "com.google.cloud:google-cloud-datacatalog", // google_cloud_platform_libraries_bom sets version google_cloud_dataflow_java_proto_library_all: "com.google.cloud.dataflow:google-cloud-dataflow-java-proto-library-all:0.5.160304", - google_cloud_datastore_v1_proto_client : "com.google.cloud.datastore:datastore-v1-proto-client:2.23.0", // [bomupgrader] sets version + google_cloud_datastore_v1_proto_client : "com.google.cloud.datastore:datastore-v1-proto-client:2.24.1", // [bomupgrader] sets version google_cloud_firestore : "com.google.cloud:google-cloud-firestore", // google_cloud_platform_libraries_bom sets version google_cloud_pubsub : "com.google.cloud:google-cloud-pubsub", // google_cloud_platform_libraries_bom sets version google_cloud_pubsublite : "com.google.cloud:google-cloud-pubsublite", // google_cloud_platform_libraries_bom sets version // [bomupgrader] the BOM version is set by scripts/tools/bomupgrader.py. 
If update manually, also update // libraries-bom version on sdks/java/container/license_scripts/dep_urls_java.yaml - google_cloud_platform_libraries_bom : "com.google.cloud:libraries-bom:26.49.0", + google_cloud_platform_libraries_bom : "com.google.cloud:libraries-bom:26.50.0", google_cloud_secret_manager : "com.google.cloud:google-cloud-secretmanager", // google_cloud_platform_libraries_bom sets version google_cloud_spanner : "com.google.cloud:google-cloud-spanner", // google_cloud_platform_libraries_bom sets version google_cloud_spanner_test : "com.google.cloud:google-cloud-spanner:$google_cloud_spanner_version:tests", diff --git a/sdks/java/container/license_scripts/dep_urls_java.yaml b/sdks/java/container/license_scripts/dep_urls_java.yaml index 781a0decda78..cdb625bea447 100644 --- a/sdks/java/container/license_scripts/dep_urls_java.yaml +++ b/sdks/java/container/license_scripts/dep_urls_java.yaml @@ -46,7 +46,7 @@ jaxen: '1.1.6': type: "3-Clause BSD" libraries-bom: - '26.49.0': + '26.50.0': license: "https://raw.githubusercontent.com/GoogleCloudPlatform/cloud-opensource-java/master/LICENSE" type: "Apache License 2.0" paranamer: diff --git a/sdks/java/io/amazon-web-services2/build.gradle b/sdks/java/io/amazon-web-services2/build.gradle index b1b5e3abee69..122fa18de895 100644 --- a/sdks/java/io/amazon-web-services2/build.gradle +++ b/sdks/java/io/amazon-web-services2/build.gradle @@ -52,7 +52,7 @@ dependencies { implementation library.java.aws_java_sdk2_http_client_spi, excludeNetty implementation library.java.aws_java_sdk2_apache_client, excludeNetty implementation library.java.aws_java_sdk2_netty_client, excludeNetty - implementation("software.amazon.kinesis:amazon-kinesis-client:2.4.8") { + implementation("software.amazon.kinesis:amazon-kinesis-client:3.0.1") { // Note: The KCL client isn't used. However, unfortunately, some model classes of KCL leak into the // KinesisIO API (KinesisClientRecord, InitialPositionInStream). Additionally, KinesisIO // internally uses KCL utils to generate aggregated messages and de-aggregate them. 
diff --git a/sdks/java/io/google-cloud-platform/src/main/java/org/apache/beam/sdk/io/gcp/firestore/FirestoreV1.java b/sdks/java/io/google-cloud-platform/src/main/java/org/apache/beam/sdk/io/gcp/firestore/FirestoreV1.java index 37d0227555b7..e9d0709343f5 100644 --- a/sdks/java/io/google-cloud-platform/src/main/java/org/apache/beam/sdk/io/gcp/firestore/FirestoreV1.java +++ b/sdks/java/io/google-cloud-platform/src/main/java/org/apache/beam/sdk/io/gcp/firestore/FirestoreV1.java @@ -1197,7 +1197,7 @@ static final class PartitionQueryResponseToRunQueryRequest .filter( v -> { String referenceValue = v.getReferenceValue(); - return referenceValue != null && !referenceValue.isEmpty(); + return !referenceValue.isEmpty(); }) .findFirst(); Function stringToPath = (String s) -> s.split("/"); diff --git a/sdks/java/io/google-cloud-platform/src/main/java/org/apache/beam/sdk/io/gcp/pubsub/PubsubGrpcClient.java b/sdks/java/io/google-cloud-platform/src/main/java/org/apache/beam/sdk/io/gcp/pubsub/PubsubGrpcClient.java index 0cfb06688108..de6c14722898 100644 --- a/sdks/java/io/google-cloud-platform/src/main/java/org/apache/beam/sdk/io/gcp/pubsub/PubsubGrpcClient.java +++ b/sdks/java/io/google-cloud-platform/src/main/java/org/apache/beam/sdk/io/gcp/pubsub/PubsubGrpcClient.java @@ -269,13 +269,12 @@ public List pull( List incomingMessages = new ArrayList<>(response.getReceivedMessagesCount()); for (ReceivedMessage message : response.getReceivedMessagesList()) { PubsubMessage pubsubMessage = message.getMessage(); - @Nullable Map attributes = pubsubMessage.getAttributes(); + Map attributes = pubsubMessage.getAttributes(); // Timestamp. long timestampMsSinceEpoch; if (Strings.isNullOrEmpty(timestampAttribute)) { Timestamp timestampProto = pubsubMessage.getPublishTime(); - checkArgument(timestampProto != null, "Pubsub message is missing timestamp proto"); timestampMsSinceEpoch = timestampProto.getSeconds() * 1000 + timestampProto.getNanos() / 1000L / 1000L; } else { @@ -288,7 +287,7 @@ public List pull( // Record id, if any. 
@Nullable String recordId = null; - if (idAttribute != null && attributes != null) { + if (idAttribute != null) { recordId = attributes.get(idAttribute); } if (Strings.isNullOrEmpty(recordId)) { diff --git a/sdks/java/io/google-cloud-platform/src/main/java/org/apache/beam/sdk/io/gcp/pubsub/PubsubJsonClient.java b/sdks/java/io/google-cloud-platform/src/main/java/org/apache/beam/sdk/io/gcp/pubsub/PubsubJsonClient.java index 0a838da66f69..a12c64ff9a9b 100644 --- a/sdks/java/io/google-cloud-platform/src/main/java/org/apache/beam/sdk/io/gcp/pubsub/PubsubJsonClient.java +++ b/sdks/java/io/google-cloud-platform/src/main/java/org/apache/beam/sdk/io/gcp/pubsub/PubsubJsonClient.java @@ -158,12 +158,7 @@ public int publish(TopicPath topic, List outgoingMessages) thro } private Map getMessageAttributes(OutgoingMessage outgoingMessage) { - Map attributes = null; - if (outgoingMessage.getMessage().getAttributesMap() == null) { - attributes = new TreeMap<>(); - } else { - attributes = new TreeMap<>(outgoingMessage.getMessage().getAttributesMap()); - } + Map attributes = new TreeMap<>(outgoingMessage.getMessage().getAttributesMap()); if (timestampAttribute != null) { attributes.put( timestampAttribute, String.valueOf(outgoingMessage.getTimestampMsSinceEpoch())); From f27547de5dde0c802350d10d358cd86ea98bc9d1 Mon Sep 17 00:00:00 2001 From: Damon Date: Fri, 3 Jan 2025 10:11:55 -0800 Subject: [PATCH 12/43] [#32167][Prism] Use the worker-id gRPC metadata (#33438) * Implement MultiplexW and Pool * Add missing license header * Add multiplex worker to prism execute * remove unused props * Fix Prism python precommit * Handle worker_id is empty string error * Fix python worker id interceptor * default empty _worker_id * Revert defaulting worker id * Fix worker_id in docker env * Update per PR comments * Add lock/unlock to MultiplexW * Delegate W deletion via MW * Remove unnecessary guard * Small fixes after PR review * Add code comment to MakeWorker * clean up commented out code * Revert portable/common changes --- .../runners/prism/internal/environments.go | 2 +- .../beam/runners/prism/internal/execute.go | 46 ++--- .../runners/prism/internal/jobservices/job.go | 13 ++ .../prism/internal/jobservices/management.go | 1 + .../prism/internal/jobservices/server.go | 6 + .../prism/internal/worker/bundle_test.go | 2 +- .../runners/prism/internal/worker/worker.go | 187 +++++++++++++----- .../prism/internal/worker/worker_test.go | 95 +++++++-- 8 files changed, 249 insertions(+), 103 deletions(-) diff --git a/sdks/go/pkg/beam/runners/prism/internal/environments.go b/sdks/go/pkg/beam/runners/prism/internal/environments.go index 2f960a04f0cb..be4809f5e2f6 100644 --- a/sdks/go/pkg/beam/runners/prism/internal/environments.go +++ b/sdks/go/pkg/beam/runners/prism/internal/environments.go @@ -147,7 +147,7 @@ func dockerEnvironment(ctx context.Context, logger *slog.Logger, dp *pipepb.Dock ccr, err := cli.ContainerCreate(ctx, &container.Config{ Image: dp.GetContainerImage(), Cmd: []string{ - fmt.Sprintf("--id=%v-%v", wk.JobKey, wk.Env), + fmt.Sprintf("--id=%v", wk.ID), fmt.Sprintf("--control_endpoint=%v", wk.Endpoint()), fmt.Sprintf("--artifact_endpoint=%v", artifactEndpoint), fmt.Sprintf("--provision_endpoint=%v", wk.Endpoint()), diff --git a/sdks/go/pkg/beam/runners/prism/internal/execute.go b/sdks/go/pkg/beam/runners/prism/internal/execute.go index 8b56c30eb61b..2cc62769d2a9 100644 --- a/sdks/go/pkg/beam/runners/prism/internal/execute.go +++ b/sdks/go/pkg/beam/runners/prism/internal/execute.go @@ -53,13 +53,24 @@ func 
RunPipeline(j *jobservices.Job) { envs := j.Pipeline.GetComponents().GetEnvironments() wks := map[string]*worker.W{} for envID := range envs { - wk, err := makeWorker(envID, j) - if err != nil { - j.Failed(err) + wk := j.MakeWorker(envID) + wks[envID] = wk + if err := runEnvironment(j.RootCtx, j, envID, wk); err != nil { + j.Failed(fmt.Errorf("failed to start environment %v for job %v: %w", envID, j, err)) return } - wks[envID] = wk + // Check for connection succeeding after we've created the environment successfully. + timeout := 1 * time.Minute + time.AfterFunc(timeout, func() { + if wk.Connected() || wk.Stopped() { + return + } + err := fmt.Errorf("prism %v didn't get control connection to %v after %v", wk, wk.Endpoint(), timeout) + j.Failed(err) + j.CancelFn(err) + }) } + // When this function exits, we cancel the context to clear // any related job resources. defer func() { @@ -86,33 +97,6 @@ func RunPipeline(j *jobservices.Job) { j.Done() } -// makeWorker creates a worker for that environment. -func makeWorker(env string, j *jobservices.Job) (*worker.W, error) { - wk := worker.New(j.String()+"_"+env, env) - - wk.EnvPb = j.Pipeline.GetComponents().GetEnvironments()[env] - wk.PipelineOptions = j.PipelineOptions() - wk.JobKey = j.JobKey() - wk.ArtifactEndpoint = j.ArtifactEndpoint() - - go wk.Serve() - - if err := runEnvironment(j.RootCtx, j, env, wk); err != nil { - return nil, fmt.Errorf("failed to start environment %v for job %v: %w", env, j, err) - } - // Check for connection succeeding after we've created the environment successfully. - timeout := 1 * time.Minute - time.AfterFunc(timeout, func() { - if wk.Connected() || wk.Stopped() { - return - } - err := fmt.Errorf("prism %v didn't get control connection to %v after %v", wk, wk.Endpoint(), timeout) - j.Failed(err) - j.CancelFn(err) - }) - return wk, nil -} - type transformExecuter interface { ExecuteUrns() []string ExecuteTransform(stageID, tid string, t *pipepb.PTransform, comps *pipepb.Components, watermark mtime.Time, data [][]byte) *worker.B diff --git a/sdks/go/pkg/beam/runners/prism/internal/jobservices/job.go b/sdks/go/pkg/beam/runners/prism/internal/jobservices/job.go index 6158cd6d612c..4be64e5a9c80 100644 --- a/sdks/go/pkg/beam/runners/prism/internal/jobservices/job.go +++ b/sdks/go/pkg/beam/runners/prism/internal/jobservices/job.go @@ -38,6 +38,7 @@ import ( jobpb "github.com/apache/beam/sdks/v2/go/pkg/beam/model/jobmanagement_v1" pipepb "github.com/apache/beam/sdks/v2/go/pkg/beam/model/pipeline_v1" "github.com/apache/beam/sdks/v2/go/pkg/beam/runners/prism/internal/urns" + "github.com/apache/beam/sdks/v2/go/pkg/beam/runners/prism/internal/worker" "google.golang.org/protobuf/types/known/structpb" ) @@ -93,6 +94,7 @@ type Job struct { Logger *slog.Logger metrics metricsStore + mw *worker.MultiplexW } func (j *Job) ArtifactEndpoint() string { @@ -198,3 +200,14 @@ func (j *Job) Failed(err error) { j.sendState(jobpb.JobState_FAILED) j.CancelFn(fmt.Errorf("jobFailed %v: %w", j, err)) } + +// MakeWorker instantiates a worker.W populating environment and pipeline data from the Job. 
+func (j *Job) MakeWorker(env string) *worker.W { + wk := j.mw.MakeWorker(j.String()+"_"+env, env) + wk.EnvPb = j.Pipeline.GetComponents().GetEnvironments()[env] + wk.PipelineOptions = j.PipelineOptions() + wk.JobKey = j.JobKey() + wk.ArtifactEndpoint = j.ArtifactEndpoint() + + return wk +} diff --git a/sdks/go/pkg/beam/runners/prism/internal/jobservices/management.go b/sdks/go/pkg/beam/runners/prism/internal/jobservices/management.go index af559a92ab46..b9a28e4bc652 100644 --- a/sdks/go/pkg/beam/runners/prism/internal/jobservices/management.go +++ b/sdks/go/pkg/beam/runners/prism/internal/jobservices/management.go @@ -94,6 +94,7 @@ func (s *Server) Prepare(ctx context.Context, req *jobpb.PrepareJobRequest) (_ * }, Logger: s.logger, // TODO substitute with a configured logger. artifactEndpoint: s.Endpoint(), + mw: s.mw, } // Stop the idle timer when a new job appears. if idleTimer := s.idleTimer.Load(); idleTimer != nil { diff --git a/sdks/go/pkg/beam/runners/prism/internal/jobservices/server.go b/sdks/go/pkg/beam/runners/prism/internal/jobservices/server.go index bdfe2aff2dd4..fb55fc54bf93 100644 --- a/sdks/go/pkg/beam/runners/prism/internal/jobservices/server.go +++ b/sdks/go/pkg/beam/runners/prism/internal/jobservices/server.go @@ -28,6 +28,7 @@ import ( fnpb "github.com/apache/beam/sdks/v2/go/pkg/beam/model/fnexecution_v1" jobpb "github.com/apache/beam/sdks/v2/go/pkg/beam/model/jobmanagement_v1" + "github.com/apache/beam/sdks/v2/go/pkg/beam/runners/prism/internal/worker" "google.golang.org/grpc" ) @@ -60,6 +61,8 @@ type Server struct { // Artifact hack artifacts map[string][]byte + + mw *worker.MultiplexW } // NewServer acquires the indicated port. @@ -82,6 +85,9 @@ func NewServer(port int, execute func(*Job)) *Server { jobpb.RegisterJobServiceServer(s.server, s) jobpb.RegisterArtifactStagingServiceServer(s.server, s) jobpb.RegisterArtifactRetrievalServiceServer(s.server, s) + + s.mw = worker.NewMultiplexW(lis, s.server, s.logger) + return s } diff --git a/sdks/go/pkg/beam/runners/prism/internal/worker/bundle_test.go b/sdks/go/pkg/beam/runners/prism/internal/worker/bundle_test.go index 161fb199ce96..08d30f67e445 100644 --- a/sdks/go/pkg/beam/runners/prism/internal/worker/bundle_test.go +++ b/sdks/go/pkg/beam/runners/prism/internal/worker/bundle_test.go @@ -25,7 +25,7 @@ import ( ) func TestBundle_ProcessOn(t *testing.T) { - wk := New("test", "testEnv") + wk := newWorker() b := &B{ InstID: "testInst", PBDID: "testPBDID", diff --git a/sdks/go/pkg/beam/runners/prism/internal/worker/worker.go b/sdks/go/pkg/beam/runners/prism/internal/worker/worker.go index 9d9058975b26..b4133b0332a6 100644 --- a/sdks/go/pkg/beam/runners/prism/internal/worker/worker.go +++ b/sdks/go/pkg/beam/runners/prism/internal/worker/worker.go @@ -23,11 +23,9 @@ import ( "fmt" "io" "log/slog" - "math" "net" "sync" "sync/atomic" - "time" "github.com/apache/beam/sdks/v2/go/pkg/beam/core" "github.com/apache/beam/sdks/v2/go/pkg/beam/core/graph/coder" @@ -38,6 +36,7 @@ import ( pipepb "github.com/apache/beam/sdks/v2/go/pkg/beam/model/pipeline_v1" "github.com/apache/beam/sdks/v2/go/pkg/beam/runners/prism/internal/engine" "github.com/apache/beam/sdks/v2/go/pkg/beam/runners/prism/internal/urns" + "github.com/apache/beam/sdks/v2/go/pkg/beam/util/grpcx" "google.golang.org/grpc" "google.golang.org/grpc/codes" "google.golang.org/grpc/status" @@ -55,16 +54,14 @@ type W struct { fnpb.UnimplementedBeamFnLoggingServer fnpb.UnimplementedProvisionServiceServer + parentPool *MultiplexW + ID, Env string JobKey, ArtifactEndpoint string 
EnvPb *pipepb.Environment PipelineOptions *structpb.Struct - // Server management - lis net.Listener - server *grpc.Server - // These are the ID sources inst uint64 connected, stopped atomic.Bool @@ -82,45 +79,8 @@ type controlResponder interface { Respond(*fnpb.InstructionResponse) } -// New starts the worker server components of FnAPI Execution. -func New(id, env string) *W { - lis, err := net.Listen("tcp", ":0") - if err != nil { - panic(fmt.Sprintf("failed to listen: %v", err)) - } - opts := []grpc.ServerOption{ - grpc.MaxRecvMsgSize(math.MaxInt32), - } - wk := &W{ - ID: id, - Env: env, - lis: lis, - server: grpc.NewServer(opts...), - - InstReqs: make(chan *fnpb.InstructionRequest, 10), - DataReqs: make(chan *fnpb.Elements, 10), - StoppedChan: make(chan struct{}), - - activeInstructions: make(map[string]controlResponder), - Descriptors: make(map[string]*fnpb.ProcessBundleDescriptor), - } - slog.Debug("Serving Worker components", slog.String("endpoint", wk.Endpoint())) - fnpb.RegisterBeamFnControlServer(wk.server, wk) - fnpb.RegisterBeamFnDataServer(wk.server, wk) - fnpb.RegisterBeamFnLoggingServer(wk.server, wk) - fnpb.RegisterBeamFnStateServer(wk.server, wk) - fnpb.RegisterProvisionServiceServer(wk.server, wk) - return wk -} - func (wk *W) Endpoint() string { - _, port, _ := net.SplitHostPort(wk.lis.Addr().String()) - return fmt.Sprintf("localhost:%v", port) -} - -// Serve serves on the started listener. Blocks. -func (wk *W) Serve() { - wk.server.Serve(wk.lis) + return wk.parentPool.endpoint } func (wk *W) String() string { @@ -154,16 +114,7 @@ func (wk *W) shutdown() { // Stop the GRPC server. func (wk *W) Stop() { wk.shutdown() - - // Give the SDK side 5 seconds to gracefully stop, before - // hard stopping all RPCs. - tim := time.AfterFunc(5*time.Second, func() { - wk.server.Stop() - }) - wk.server.GracefulStop() - tim.Stop() - - wk.lis.Close() + wk.parentPool.delete(wk) slog.Debug("stopped", "worker", wk) } @@ -710,3 +661,131 @@ func (wk *W) MonitoringMetadata(ctx context.Context, unknownIDs []string) *fnpb. }, }).GetMonitoringInfos() } + +// MultiplexW forwards FnAPI gRPC requests to W it manages in an in-memory pool. +type MultiplexW struct { + fnpb.UnimplementedBeamFnControlServer + fnpb.UnimplementedBeamFnDataServer + fnpb.UnimplementedBeamFnStateServer + fnpb.UnimplementedBeamFnLoggingServer + fnpb.UnimplementedProvisionServiceServer + + mu sync.Mutex + endpoint string + logger *slog.Logger + pool map[string]*W +} + +// NewMultiplexW instantiates a new FnAPI server for multiplexing FnAPI requests to a W. +func NewMultiplexW(lis net.Listener, g *grpc.Server, logger *slog.Logger) *MultiplexW { + _, p, _ := net.SplitHostPort(lis.Addr().String()) + mw := &MultiplexW{ + endpoint: "localhost:" + p, + logger: logger, + pool: make(map[string]*W), + } + + fnpb.RegisterBeamFnControlServer(g, mw) + fnpb.RegisterBeamFnDataServer(g, mw) + fnpb.RegisterBeamFnLoggingServer(g, mw) + fnpb.RegisterBeamFnStateServer(g, mw) + fnpb.RegisterProvisionServiceServer(g, mw) + + return mw +} + +// MakeWorker creates and registers a W, assigning id and env to W.ID and W.Env, respectively, associating W.ID +// to *W for later lookup. MultiplexW expects FnAPI gRPC requests to contain a matching 'worker_id' in its context +// metadata. A gRPC client should use the grpcx.WriteWorkerID helper method prior to sending the request. 
+func (mw *MultiplexW) MakeWorker(id, env string) *W { + mw.mu.Lock() + defer mw.mu.Unlock() + w := &W{ + ID: id, + Env: env, + + InstReqs: make(chan *fnpb.InstructionRequest, 10), + DataReqs: make(chan *fnpb.Elements, 10), + StoppedChan: make(chan struct{}), + + activeInstructions: make(map[string]controlResponder), + Descriptors: make(map[string]*fnpb.ProcessBundleDescriptor), + parentPool: mw, + } + mw.pool[id] = w + return w +} + +func (mw *MultiplexW) GetProvisionInfo(ctx context.Context, req *fnpb.GetProvisionInfoRequest) (*fnpb.GetProvisionInfoResponse, error) { + return handleUnary(mw, ctx, req, (*W).GetProvisionInfo) +} + +func (mw *MultiplexW) Logging(stream fnpb.BeamFnLogging_LoggingServer) error { + return handleStream(mw, stream.Context(), stream, (*W).Logging) +} + +func (mw *MultiplexW) GetProcessBundleDescriptor(ctx context.Context, req *fnpb.GetProcessBundleDescriptorRequest) (*fnpb.ProcessBundleDescriptor, error) { + return handleUnary(mw, ctx, req, (*W).GetProcessBundleDescriptor) +} + +func (mw *MultiplexW) Control(ctrl fnpb.BeamFnControl_ControlServer) error { + return handleStream(mw, ctrl.Context(), ctrl, (*W).Control) +} + +func (mw *MultiplexW) Data(data fnpb.BeamFnData_DataServer) error { + return handleStream(mw, data.Context(), data, (*W).Data) +} + +func (mw *MultiplexW) State(state fnpb.BeamFnState_StateServer) error { + return handleStream(mw, state.Context(), state, (*W).State) +} + +func (mw *MultiplexW) MonitoringMetadata(ctx context.Context, unknownIDs []string) *fnpb.MonitoringInfosMetadataResponse { + mw.mu.Lock() + defer mw.mu.Unlock() + w, err := mw.workerFromMetadataCtx(ctx) + if err != nil { + mw.logger.Error(err.Error()) + return nil + } + return w.MonitoringMetadata(ctx, unknownIDs) +} + +func (mw *MultiplexW) workerFromMetadataCtx(ctx context.Context) (*W, error) { + mw.mu.Lock() + defer mw.mu.Unlock() + id, err := grpcx.ReadWorkerID(ctx) + if err != nil { + return nil, err + } + if id == "" { + return nil, fmt.Errorf("worker_id read from context metadata is an empty string") + } + w, ok := mw.pool[id] + if !ok { + return nil, fmt.Errorf("worker_id: '%s' read from context metadata but not registered in worker pool", id) + } + return w, nil +} + +func (mw *MultiplexW) delete(w *W) { + mw.mu.Lock() + defer mw.mu.Unlock() + delete(mw.pool, w.ID) +} + +func handleUnary[Request any, Response any, Method func(*W, context.Context, *Request) (*Response, error)](mw *MultiplexW, ctx context.Context, req *Request, m Method) (*Response, error) { + w, err := mw.workerFromMetadataCtx(ctx) + if err != nil { + return nil, err + } + return m(w, ctx, req) +} + +func handleStream[Stream any, Method func(*W, Stream) error](mw *MultiplexW, ctx context.Context, stream Stream, m Method) error { + w, err := mw.workerFromMetadataCtx(ctx) + if err != nil { + return err + } + return m(w, stream) +} diff --git a/sdks/go/pkg/beam/runners/prism/internal/worker/worker_test.go b/sdks/go/pkg/beam/runners/prism/internal/worker/worker_test.go index 469e0e2f3d83..a0cf577fbdba 100644 --- a/sdks/go/pkg/beam/runners/prism/internal/worker/worker_test.go +++ b/sdks/go/pkg/beam/runners/prism/internal/worker/worker_test.go @@ -18,34 +18,88 @@ package worker import ( "bytes" "context" + "log/slog" "net" "sort" "sync" "testing" "time" - "github.com/google/go-cmp/cmp" - "github.com/apache/beam/sdks/v2/go/pkg/beam/core/graph/coder" "github.com/apache/beam/sdks/v2/go/pkg/beam/core/graph/window" "github.com/apache/beam/sdks/v2/go/pkg/beam/core/runtime/exec" 
"github.com/apache/beam/sdks/v2/go/pkg/beam/core/typex" fnpb "github.com/apache/beam/sdks/v2/go/pkg/beam/model/fnexecution_v1" "github.com/apache/beam/sdks/v2/go/pkg/beam/runners/prism/internal/engine" + "github.com/apache/beam/sdks/v2/go/pkg/beam/util/grpcx" + "github.com/google/go-cmp/cmp" "google.golang.org/grpc" "google.golang.org/grpc/credentials/insecure" + "google.golang.org/grpc/metadata" "google.golang.org/grpc/test/bufconn" ) -func TestWorker_New(t *testing.T) { - w := New("test", "testEnv") +func TestMultiplexW_MakeWorker(t *testing.T) { + w := newWorker() + if w.parentPool == nil { + t.Errorf("MakeWorker instantiated W with a nil reference to MultiplexW") + } if got, want := w.ID, "test"; got != want { - t.Errorf("New(%q) = %v, want %v", want, got, want) + t.Errorf("MakeWorker(%q) = %v, want %v", want, got, want) + } + got, ok := w.parentPool.pool[w.ID] + if !ok || got == nil { + t.Errorf("MakeWorker(%q) not registered in worker pool %v", w.ID, w.parentPool.pool) + } +} + +func TestMultiplexW_workerFromMetadataCtx(t *testing.T) { + for _, tt := range []struct { + name string + ctx context.Context + want *W + wantErr string + }{ + { + name: "empty ctx metadata", + ctx: context.Background(), + wantErr: "failed to read metadata from context", + }, + { + name: "worker_id empty", + ctx: metadata.NewIncomingContext(context.Background(), metadata.Pairs("worker_id", "")), + wantErr: "worker_id read from context metadata is an empty string", + }, + { + name: "mismatched worker_id", + ctx: metadata.NewIncomingContext(context.Background(), metadata.Pairs("worker_id", "doesn't exist")), + wantErr: "worker_id: 'doesn't exist' read from context metadata but not registered in worker pool", + }, + { + name: "matched worker_id", + ctx: metadata.NewIncomingContext(context.Background(), metadata.Pairs("worker_id", "test")), + want: &W{ID: "test"}, + }, + } { + t.Run(tt.name, func(t *testing.T) { + w := newWorker() + got, err := w.parentPool.workerFromMetadataCtx(tt.ctx) + if err != nil && err.Error() != tt.wantErr { + t.Errorf("workerFromMetadataCtx() error = %v, wantErr %v", err, tt.wantErr) + return + } + if tt.wantErr != "" { + return + } + if got.ID != tt.want.ID { + t.Errorf("workerFromMetadataCtx() id = %v, want %v", got.ID, tt.want.ID) + } + }) } } func TestWorker_NextInst(t *testing.T) { - w := New("test", "testEnv") + w := newWorker() instIDs := map[string]struct{}{} for i := 0; i < 100; i++ { @@ -57,7 +111,7 @@ func TestWorker_NextInst(t *testing.T) { } func TestWorker_GetProcessBundleDescriptor(t *testing.T) { - w := New("test", "testEnv") + w := newWorker() id := "available" w.Descriptors[id] = &fnpb.ProcessBundleDescriptor{ @@ -87,19 +141,21 @@ func serveTestWorker(t *testing.T) (context.Context, *W, *grpc.ClientConn) { ctx, cancelFn := context.WithCancel(context.Background()) t.Cleanup(cancelFn) - w := New("test", "testEnv") + g := grpc.NewServer() lis := bufconn.Listen(2048) - w.lis = lis - t.Cleanup(func() { w.Stop() }) - go w.Serve() - - clientConn, err := grpc.DialContext(ctx, "", grpc.WithContextDialer(func(ctx context.Context, _ string) (net.Conn, error) { - return lis.DialContext(ctx) - }), grpc.WithTransportCredentials(insecure.NewCredentials()), grpc.WithBlock()) + mw := NewMultiplexW(lis, g, slog.Default()) + t.Cleanup(func() { g.Stop() }) + go g.Serve(lis) + w := mw.MakeWorker("test", "testEnv") + ctx = metadata.NewIncomingContext(ctx, metadata.Pairs("worker_id", w.ID)) + ctx = grpcx.WriteWorkerID(ctx, w.ID) + conn, err := grpc.DialContext(ctx, w.Endpoint(), 
grpc.WithContextDialer(func(ctx context.Context, addr string) (net.Conn, error) { + return lis.Dial() + }), grpc.WithTransportCredentials(insecure.NewCredentials())) if err != nil { t.Fatal("couldn't create bufconn grpc connection:", err) } - return ctx, w, clientConn + return ctx, w, conn } type closeSend func() @@ -465,3 +521,10 @@ func TestWorker_State_MultimapSideInput(t *testing.T) { }) } } + +func newWorker() *W { + mw := &MultiplexW{ + pool: map[string]*W{}, + } + return mw.MakeWorker("test", "testEnv") +} From 4a3248282c650735afe9d3330c698649d9d702c8 Mon Sep 17 00:00:00 2001 From: Damon Date: Fri, 3 Jan 2025 11:16:19 -0800 Subject: [PATCH 13/43] [Distroless] Publish Java SDK distroless variants (#33485) * Add docker/push step * Fix missing tag error * Try cli options full name * Update beam_Publish_Java_SDK_Distroless_Snapshots.yml * Update beam_Publish_Java_SDK_Distroless_Snapshots.yml --- .../beam_Publish_Java_SDK_Distroless_Snapshots.yml | 12 ++++++++++-- 1 file changed, 10 insertions(+), 2 deletions(-) diff --git a/.github/workflows/beam_Publish_Java_SDK_Distroless_Snapshots.yml b/.github/workflows/beam_Publish_Java_SDK_Distroless_Snapshots.yml index 944648fc919b..74a0f9a81d63 100644 --- a/.github/workflows/beam_Publish_Java_SDK_Distroless_Snapshots.yml +++ b/.github/workflows/beam_Publish_Java_SDK_Distroless_Snapshots.yml @@ -83,5 +83,13 @@ jobs: - name: GCloud Docker credential helper run: | gcloud auth configure-docker ${{ env.docker_registry }} - # TODO(https://github.com/apache/beam/issues/33201): create after merging into main branch - # - name: Build and push Java distroless image + - name: Build and push Java distroless image + run: | + docker buildx build --push \ + -t gcr.io/apache-beam-testing/beam-sdk/beam_${{ matrix.java_version }}_sdk_distroless:${{ github.sha }} \ + -t gcr.io/apache-beam-testing/beam-sdk/beam_${{ matrix.java_version }}_sdk_distroless:${BEAM_VERSION} \ + -t gcr.io/apache-beam-testing/beam-sdk/beam_${{ matrix.java_version }}_sdk_distroless:latest \ + -f sdks/java/container/Dockerfile-distroless \ + --build-arg=BEAM_BASE=gcr.io/apache-beam-testing/beam-sdk/beam_${{ matrix.java_version }}_sdk:${BEAM_VERSION} \ + --build-arg=DISTROLESS_BASE=gcr.io/distroless/${{ matrix.java_version }}-debian12 \ + . From 05473f5dd7e88b45b280da2fc0598c6f974dbcf8 Mon Sep 17 00:00:00 2001 From: Shunping Huang Date: Sat, 4 Jan 2025 15:52:49 -0500 Subject: [PATCH 14/43] Use other fallback coders for protobuf message base class (#33432) * Use other fallback coders for protobuf Message base class. * Minor change on comments * Revise the comments based on review. * Move import out of if condition. 
* Fix lints --- sdks/python/apache_beam/coders/coders.py | 13 ++++++++++--- sdks/python/apache_beam/coders/coders_test.py | 18 ++++++++++++++++++ 2 files changed, 28 insertions(+), 3 deletions(-) diff --git a/sdks/python/apache_beam/coders/coders.py b/sdks/python/apache_beam/coders/coders.py index 57d8197a3a00..0f2a42686854 100644 --- a/sdks/python/apache_beam/coders/coders.py +++ b/sdks/python/apache_beam/coders/coders.py @@ -55,6 +55,7 @@ import google.protobuf.wrappers_pb2 import proto +from google.protobuf import message from apache_beam.coders import coder_impl from apache_beam.coders.avro_record import AvroRecord @@ -65,7 +66,6 @@ from apache_beam.utils import proto_utils if TYPE_CHECKING: - from google.protobuf import message # pylint: disable=ungrouped-imports from apache_beam.coders.typecoders import CoderRegistry from apache_beam.runners.pipeline_context import PipelineContext @@ -1039,11 +1039,18 @@ def __hash__(self): @classmethod def from_type_hint(cls, typehint, unused_registry): - if issubclass(typehint, proto_utils.message_types): + # The typehint must be a strict subclass of google.protobuf.message.Message. + # ProtoCoder cannot work with message.Message itself, as deserialization of + # a serialized proto requires knowledge of the desired concrete proto + # subclass which is not stored in the encoded bytes themselves. If this + # occurs, an error is raised and the system defaults to other fallback + # coders. + if (issubclass(typehint, proto_utils.message_types) and + typehint != message.Message): return cls(typehint) else: raise ValueError(( - 'Expected a subclass of google.protobuf.message.Message' + 'Expected a strict subclass of google.protobuf.message.Message' ', but got a %s' % typehint)) def to_type_hint(self): diff --git a/sdks/python/apache_beam/coders/coders_test.py b/sdks/python/apache_beam/coders/coders_test.py index dc9780e36be3..5e5debca36e6 100644 --- a/sdks/python/apache_beam/coders/coders_test.py +++ b/sdks/python/apache_beam/coders/coders_test.py @@ -22,6 +22,7 @@ import proto import pytest +from google.protobuf import message import apache_beam as beam from apache_beam import typehints @@ -86,6 +87,23 @@ def test_proto_coder(self): self.assertEqual(ma, real_coder.decode(real_coder.encode(ma))) self.assertEqual(ma.__class__, real_coder.to_type_hint()) + def test_proto_coder_on_protobuf_message_subclasses(self): + # This replicates a scenario where users provide message.Message as the + # output typehint for a Map function, even though the actual output messages + # are subclasses of message.Message. + ma = test_message.MessageA() + mb = ma.field2.add() + mb.field1 = True + ma.field1 = 'hello world' + + coder = coders_registry.get_coder(message.Message) + # For messages of google.protobuf.message.Message, the fallback coder will + # be FastPrimitivesCoder rather than ProtoCoder. + # See the comment on ProtoCoder.from_type_hint() for further details. + self.assertEqual(coder, coders.FastPrimitivesCoder()) + + self.assertEqual(ma, coder.decode(coder.encode(ma))) + class DeterministicProtoCoderTest(unittest.TestCase): def test_deterministic_proto_coder(self): From d5248c1424da3a8356ddc225af7ee7d9cead0992 Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Sun, 5 Jan 2025 21:08:15 -0800 Subject: [PATCH 15/43] Bump github.com/aws/aws-sdk-go-v2/service/s3 in /sdks (#33500) Bumps [github.com/aws/aws-sdk-go-v2/service/s3](https://github.com/aws/aws-sdk-go-v2) from 1.71.0 to 1.72.0. 
- [Release notes](https://github.com/aws/aws-sdk-go-v2/releases) - [Commits](https://github.com/aws/aws-sdk-go-v2/compare/service/s3/v1.71.0...service/s3/v1.72.0) --- updated-dependencies: - dependency-name: github.com/aws/aws-sdk-go-v2/service/s3 dependency-type: direct:production update-type: version-update:semver-minor ... Signed-off-by: dependabot[bot] Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com> --- sdks/go.mod | 8 ++++---- sdks/go.sum | 16 ++++++++-------- 2 files changed, 12 insertions(+), 12 deletions(-) diff --git a/sdks/go.mod b/sdks/go.mod index f96d4f7fd1bf..27ec2236c9ca 100644 --- a/sdks/go.mod +++ b/sdks/go.mod @@ -34,7 +34,7 @@ require ( github.com/aws/aws-sdk-go-v2/config v1.28.7 github.com/aws/aws-sdk-go-v2/credentials v1.17.48 github.com/aws/aws-sdk-go-v2/feature/s3/manager v1.17.43 - github.com/aws/aws-sdk-go-v2/service/s3 v1.71.0 + github.com/aws/aws-sdk-go-v2/service/s3 v1.72.0 github.com/aws/smithy-go v1.22.1 github.com/docker/go-connections v0.5.0 github.com/dustin/go-humanize v1.0.1 @@ -136,11 +136,11 @@ require ( github.com/aws/aws-sdk-go-v2/internal/configsources v1.3.26 // indirect github.com/aws/aws-sdk-go-v2/internal/endpoints/v2 v2.6.26 // indirect github.com/aws/aws-sdk-go-v2/internal/ini v1.8.1 // indirect - github.com/aws/aws-sdk-go-v2/internal/v4a v1.3.25 // indirect + github.com/aws/aws-sdk-go-v2/internal/v4a v1.3.26 // indirect github.com/aws/aws-sdk-go-v2/service/internal/accept-encoding v1.12.1 // indirect - github.com/aws/aws-sdk-go-v2/service/internal/checksum v1.4.6 // indirect + github.com/aws/aws-sdk-go-v2/service/internal/checksum v1.4.7 // indirect github.com/aws/aws-sdk-go-v2/service/internal/presigned-url v1.12.7 // indirect - github.com/aws/aws-sdk-go-v2/service/internal/s3shared v1.18.6 // indirect + github.com/aws/aws-sdk-go-v2/service/internal/s3shared v1.18.7 // indirect github.com/aws/aws-sdk-go-v2/service/sso v1.24.8 // indirect github.com/aws/aws-sdk-go-v2/service/ssooidc v1.28.7 // indirect github.com/aws/aws-sdk-go-v2/service/sts v1.33.3 // indirect diff --git a/sdks/go.sum b/sdks/go.sum index 72658b7e74dc..907756d3e05e 100644 --- a/sdks/go.sum +++ b/sdks/go.sum @@ -712,22 +712,22 @@ github.com/aws/aws-sdk-go-v2/internal/endpoints/v2 v2.6.26/go.mod h1:3o2Wpy0bogG github.com/aws/aws-sdk-go-v2/internal/ini v1.1.1/go.mod h1:Zy8smImhTdOETZqfyn01iNOe0CNggVbPjCajyaz6Gvg= github.com/aws/aws-sdk-go-v2/internal/ini v1.8.1 h1:VaRN3TlFdd6KxX1x3ILT5ynH6HvKgqdiXoTxAF4HQcQ= github.com/aws/aws-sdk-go-v2/internal/ini v1.8.1/go.mod h1:FbtygfRFze9usAadmnGJNc8KsP346kEe+y2/oyhGAGc= -github.com/aws/aws-sdk-go-v2/internal/v4a v1.3.25 h1:r67ps7oHCYnflpgDy2LZU0MAQtQbYIOqNNnqGO6xQkE= -github.com/aws/aws-sdk-go-v2/internal/v4a v1.3.25/go.mod h1:GrGY+Q4fIokYLtjCVB/aFfCVL6hhGUFl8inD18fDalE= +github.com/aws/aws-sdk-go-v2/internal/v4a v1.3.26 h1:GeNJsIFHB+WW5ap2Tec4K6dzcVTsRbsT1Lra46Hv9ME= +github.com/aws/aws-sdk-go-v2/internal/v4a v1.3.26/go.mod h1:zfgMpwHDXX2WGoG84xG2H+ZlPTkJUU4YUvx2svLQYWo= github.com/aws/aws-sdk-go-v2/service/internal/accept-encoding v1.2.1/go.mod h1:v33JQ57i2nekYTA70Mb+O18KeH4KqhdqxTJZNK1zdRE= github.com/aws/aws-sdk-go-v2/service/internal/accept-encoding v1.12.1 h1:iXtILhvDxB6kPvEXgsDhGaZCSC6LQET5ZHSdJozeI0Y= github.com/aws/aws-sdk-go-v2/service/internal/accept-encoding v1.12.1/go.mod h1:9nu0fVANtYiAePIBh2/pFUSwtJ402hLnp854CNoDOeE= -github.com/aws/aws-sdk-go-v2/service/internal/checksum v1.4.6 h1:HCpPsWqmYQieU7SS6E9HXfdAMSud0pteVXieJmcpIRI= -github.com/aws/aws-sdk-go-v2/service/internal/checksum 
v1.4.6/go.mod h1:ngUiVRCco++u+soRRVBIvBZxSMMvOVMXA4PJ36JLfSw= +github.com/aws/aws-sdk-go-v2/service/internal/checksum v1.4.7 h1:tB4tNw83KcajNAzaIMhkhVI2Nt8fAZd5A5ro113FEMY= +github.com/aws/aws-sdk-go-v2/service/internal/checksum v1.4.7/go.mod h1:lvpyBGkZ3tZ9iSsUIcC2EWp+0ywa7aK3BLT+FwZi+mQ= github.com/aws/aws-sdk-go-v2/service/internal/presigned-url v1.2.1/go.mod h1:zceowr5Z1Nh2WVP8bf/3ikB41IZW59E4yIYbg+pC6mw= github.com/aws/aws-sdk-go-v2/service/internal/presigned-url v1.12.7 h1:8eUsivBQzZHqe/3FE+cqwfH+0p5Jo8PFM/QYQSmeZ+M= github.com/aws/aws-sdk-go-v2/service/internal/presigned-url v1.12.7/go.mod h1:kLPQvGUmxn/fqiCrDeohwG33bq2pQpGeY62yRO6Nrh0= github.com/aws/aws-sdk-go-v2/service/internal/s3shared v1.5.1/go.mod h1:6EQZIwNNvHpq/2/QSJnp4+ECvqIy55w95Ofs0ze+nGQ= -github.com/aws/aws-sdk-go-v2/service/internal/s3shared v1.18.6 h1:BbGDtTi0T1DYlmjBiCr/le3wzhA37O8QTC5/Ab8+EXk= -github.com/aws/aws-sdk-go-v2/service/internal/s3shared v1.18.6/go.mod h1:hLMJt7Q8ePgViKupeymbqI0la+t9/iYFBjxQCFwuAwI= +github.com/aws/aws-sdk-go-v2/service/internal/s3shared v1.18.7 h1:Hi0KGbrnr57bEHWM0bJ1QcBzxLrL/k2DHvGYhb8+W1w= +github.com/aws/aws-sdk-go-v2/service/internal/s3shared v1.18.7/go.mod h1:wKNgWgExdjjrm4qvfbTorkvocEstaoDl4WCvGfeCy9c= github.com/aws/aws-sdk-go-v2/service/s3 v1.11.1/go.mod h1:XLAGFrEjbvMCLvAtWLLP32yTv8GpBquCApZEycDLunI= -github.com/aws/aws-sdk-go-v2/service/s3 v1.71.0 h1:nyuzXooUNJexRT0Oy0UQY6AhOzxPxhtt4DcBIHyCnmw= -github.com/aws/aws-sdk-go-v2/service/s3 v1.71.0/go.mod h1:sT/iQz8JK3u/5gZkT+Hmr7GzVZehUMkRZpOaAwYXeGY= +github.com/aws/aws-sdk-go-v2/service/s3 v1.72.0 h1:SAfh4pNx5LuTafKKWR02Y+hL3A+3TX8cTKG1OIAJaBk= +github.com/aws/aws-sdk-go-v2/service/s3 v1.72.0/go.mod h1:r+xl5yzMk9083rMR+sJ5TYj9Tihvf/l1oxzZXDgGj2Q= github.com/aws/aws-sdk-go-v2/service/sso v1.3.1/go.mod h1:J3A3RGUvuCZjvSuZEcOpHDnzZP/sKbhDWV2T1EOzFIM= github.com/aws/aws-sdk-go-v2/service/sso v1.24.8 h1:CvuUmnXI7ebaUAhbJcDy9YQx8wHR69eZ9I7q5hszt/g= github.com/aws/aws-sdk-go-v2/service/sso v1.24.8/go.mod h1:XDeGv1opzwm8ubxddF0cgqkZWsyOtw4lr6dxwmb6YQg= From b3e7f55762bcb7a0829876c43bd6471dc77c593d Mon Sep 17 00:00:00 2001 From: Michel Davit Date: Mon, 6 Jan 2025 16:07:09 +0100 Subject: [PATCH 16/43] [java] Fix avro bytes conversion for BQ storage (#33412) Avro byte is represented with java ByteBuffer. Current implementation expects a byte array. 
This fails in practice with a ClassCastException: class java.nio.HeapByteBuffer cannot be cast to class [B --- .../io/gcp/bigquery/AvroGenericRecordToStorageApiProto.java | 2 +- .../gcp/bigquery/AvroGenericRecordToStorageApiProtoTest.java | 3 ++- 2 files changed, 3 insertions(+), 2 deletions(-) diff --git a/sdks/java/io/google-cloud-platform/src/main/java/org/apache/beam/sdk/io/gcp/bigquery/AvroGenericRecordToStorageApiProto.java b/sdks/java/io/google-cloud-platform/src/main/java/org/apache/beam/sdk/io/gcp/bigquery/AvroGenericRecordToStorageApiProto.java index 0b7e17b89090..c721395eec79 100644 --- a/sdks/java/io/google-cloud-platform/src/main/java/org/apache/beam/sdk/io/gcp/bigquery/AvroGenericRecordToStorageApiProto.java +++ b/sdks/java/io/google-cloud-platform/src/main/java/org/apache/beam/sdk/io/gcp/bigquery/AvroGenericRecordToStorageApiProto.java @@ -86,7 +86,7 @@ public class AvroGenericRecordToStorageApiProto { .put(Schema.Type.STRING, Object::toString) .put(Schema.Type.BOOLEAN, Function.identity()) .put(Schema.Type.ENUM, o -> o.toString()) - .put(Schema.Type.BYTES, o -> ByteString.copyFrom((byte[]) o)) + .put(Schema.Type.BYTES, o -> ByteString.copyFrom(((ByteBuffer) o).duplicate())) .build(); // A map of supported logical types to their encoding functions. diff --git a/sdks/java/io/google-cloud-platform/src/test/java/org/apache/beam/sdk/io/gcp/bigquery/AvroGenericRecordToStorageApiProtoTest.java b/sdks/java/io/google-cloud-platform/src/test/java/org/apache/beam/sdk/io/gcp/bigquery/AvroGenericRecordToStorageApiProtoTest.java index 6a59afeed823..472173c67412 100644 --- a/sdks/java/io/google-cloud-platform/src/test/java/org/apache/beam/sdk/io/gcp/bigquery/AvroGenericRecordToStorageApiProtoTest.java +++ b/sdks/java/io/google-cloud-platform/src/test/java/org/apache/beam/sdk/io/gcp/bigquery/AvroGenericRecordToStorageApiProtoTest.java @@ -28,6 +28,7 @@ import com.google.protobuf.Descriptors; import com.google.protobuf.DynamicMessage; import java.math.BigDecimal; +import java.nio.ByteBuffer; import java.nio.charset.StandardCharsets; import java.security.MessageDigest; import java.security.NoSuchAlgorithmException; @@ -308,7 +309,7 @@ enum TestEnum { Instant now = Instant.now(); baseRecord = new GenericRecordBuilder(BASE_SCHEMA) - .set("bytesValue", BYTES) + .set("bytesValue", ByteBuffer.wrap(BYTES)) .set("intValue", (int) 3) .set("longValue", (long) 4) .set("floatValue", (float) 3.14) From 28999bc3b2b118a82ccc72a7e4ffc3d6825fa464 Mon Sep 17 00:00:00 2001 From: Danny McCormick Date: Mon, 6 Jan 2025 12:04:50 -0500 Subject: [PATCH 17/43] Fix republish docker branch (#33501) * Fix republish docker branch * Don't require version/rc so that workflow can be tested without them * Remove default --- .../workflows/republish_released_docker_containers.yml | 10 ++++------ 1 file changed, 4 insertions(+), 6 deletions(-) diff --git a/.github/workflows/republish_released_docker_containers.yml b/.github/workflows/republish_released_docker_containers.yml index 3c1ff02417cb..5ab38cff10fc 100644 --- a/.github/workflows/republish_released_docker_containers.yml +++ b/.github/workflows/republish_released_docker_containers.yml @@ -24,18 +24,16 @@ on: inputs: RELEASE: description: Beam version of current release (e.g. 2.XX.0) - required: true - default: '2.61.0' + required: false RC: description: Integer RC version for the release (e.g. 
3 for RC3) - required: true - default: '3' + required: false schedule: - cron: "0 6 * * 1" env: docker_registry: gcr.io - release: "${{ github.event.inputs.RELEASE }}" - rc: "${{ github.event.inputs.RC}}" + release: "${{ github.event.inputs.RELEASE || '2.61.0' }}" + rc: "${{ github.event.inputs.RC || '3' }}" jobs: From 254c1e2659cb5942014c25bbba165552f013cfb9 Mon Sep 17 00:00:00 2001 From: Andrew Crites Date: Mon, 6 Jan 2025 09:16:24 -0800 Subject: [PATCH 18/43] Adds references to KeyCommitTooLargeException docs whever we throw the exception and also mentions --experiments=throw_exceptions_on_large_output as an option. (#33380) --- .../runners/dataflow/worker/OutputTooLargeException.java | 4 +++- .../apache/beam/runners/dataflow/worker/WindmillSink.java | 8 ++++++-- .../worker/streaming/KeyCommitTooLargeException.java | 3 ++- 3 files changed, 11 insertions(+), 4 deletions(-) diff --git a/runners/google-cloud-dataflow-java/worker/src/main/java/org/apache/beam/runners/dataflow/worker/OutputTooLargeException.java b/runners/google-cloud-dataflow-java/worker/src/main/java/org/apache/beam/runners/dataflow/worker/OutputTooLargeException.java index 9f4b413841c5..acfd8a291108 100644 --- a/runners/google-cloud-dataflow-java/worker/src/main/java/org/apache/beam/runners/dataflow/worker/OutputTooLargeException.java +++ b/runners/google-cloud-dataflow-java/worker/src/main/java/org/apache/beam/runners/dataflow/worker/OutputTooLargeException.java @@ -22,7 +22,9 @@ /** Indicates that an output element was too large. */ public class OutputTooLargeException extends RuntimeException { public OutputTooLargeException(String reason) { - super(reason); + super( + reason + + " See https://cloud.google.com/dataflow/docs/guides/common-errors#key-commit-too-large-exception."); } /** Returns whether an exception was caused by a {@link OutputTooLargeException}. */ diff --git a/runners/google-cloud-dataflow-java/worker/src/main/java/org/apache/beam/runners/dataflow/worker/WindmillSink.java b/runners/google-cloud-dataflow-java/worker/src/main/java/org/apache/beam/runners/dataflow/worker/WindmillSink.java index 78d0c6b4550a..f83c68ab3c90 100644 --- a/runners/google-cloud-dataflow-java/worker/src/main/java/org/apache/beam/runners/dataflow/worker/WindmillSink.java +++ b/runners/google-cloud-dataflow-java/worker/src/main/java/org/apache/beam/runners/dataflow/worker/WindmillSink.java @@ -183,7 +183,9 @@ public long add(WindowedValue data) throws IOException { "Trying to output too large key with size " + key.size() + ". Limit is " - + context.getMaxOutputKeyBytes()); + + context.getMaxOutputKeyBytes() + + ". See https://cloud.google.com/dataflow/docs/guides/common-errors#key-commit-too-large-exception." + + " Running with --experiments=throw_exceptions_on_large_output will instead throw an OutputTooLargeException which may be caught in user code."); } } if (value.size() > context.getMaxOutputValueBytes()) { @@ -194,7 +196,9 @@ public long add(WindowedValue data) throws IOException { "Trying to output too large value with size " + value.size() + ". Limit is " - + context.getMaxOutputValueBytes()); + + context.getMaxOutputValueBytes() + + ". See https://cloud.google.com/dataflow/docs/guides/common-errors#key-commit-too-large-exception." 
+ + " Running with --experiments=throw_exceptions_on_large_output will instead throw an OutputTooLargeException which may be caught in user code."); } } diff --git a/runners/google-cloud-dataflow-java/worker/src/main/java/org/apache/beam/runners/dataflow/worker/streaming/KeyCommitTooLargeException.java b/runners/google-cloud-dataflow-java/worker/src/main/java/org/apache/beam/runners/dataflow/worker/streaming/KeyCommitTooLargeException.java index 090d9981309e..76228b9092b3 100644 --- a/runners/google-cloud-dataflow-java/worker/src/main/java/org/apache/beam/runners/dataflow/worker/streaming/KeyCommitTooLargeException.java +++ b/runners/google-cloud-dataflow-java/worker/src/main/java/org/apache/beam/runners/dataflow/worker/streaming/KeyCommitTooLargeException.java @@ -40,7 +40,8 @@ public static KeyCommitTooLargeException causedBy( message.append( ". This may be caused by grouping a very " + "large amount of data in a single window without using Combine," - + " or by producing a large amount of data from a single input element."); + + " or by producing a large amount of data from a single input element." + + " See https://cloud.google.com/dataflow/docs/guides/common-errors#key-commit-too-large-exception."); return new KeyCommitTooLargeException(message.toString()); } From a1df014012757cc6d35290345580ea418c564907 Mon Sep 17 00:00:00 2001 From: Robert Burke Date: Mon, 6 Jan 2025 11:08:14 -0800 Subject: [PATCH 19/43] [#31912] Fix bug in finalization for Prism & fix the Java bundle finalizing tests. (#33493) --- runners/prism/java/build.gradle | 8 ------- .../prism/internal/engine/elementmanager.go | 23 +------------------ .../prism/internal/engine/teststream.go | 4 +--- .../beam/runners/prism/internal/execute.go | 1 - .../beam/runners/prism/internal/preprocess.go | 4 +++- .../apache/beam/sdk/transforms/ParDoTest.java | 20 +++++++++++++--- 6 files changed, 22 insertions(+), 38 deletions(-) diff --git a/runners/prism/java/build.gradle b/runners/prism/java/build.gradle index cd2e90fde67c..a48973f65674 100644 --- a/runners/prism/java/build.gradle +++ b/runners/prism/java/build.gradle @@ -187,14 +187,6 @@ def sickbayTests = [ // Missing output due to processing time timer skew. 'org.apache.beam.sdk.transforms.ParDoTest$TimestampTests.testProcessElementSkew', - // TestStream + BundleFinalization. - // Tests seem to assume individual element bundles from test stream, but prism will aggregate them, preventing - // a subsequent firing. Tests ultimately hang until timeout. - // Either a test problem, or a misunderstanding of how test stream must work problem in prism. - // Biased to test problem, due to how they are constructed. - 'org.apache.beam.sdk.transforms.ParDoTest$BundleFinalizationTests.testBundleFinalization', - 'org.apache.beam.sdk.transforms.ParDoTest$BundleFinalizationTests.testBundleFinalizationWithSideInputs', - // Filtered by PortableRunner tests. // Teardown not called in exceptions // https://github.com/apache/beam/issues/20372 diff --git a/sdks/go/pkg/beam/runners/prism/internal/engine/elementmanager.go b/sdks/go/pkg/beam/runners/prism/internal/engine/elementmanager.go index 7180bb456f1a..3cfcf9ef8c0e 100644 --- a/sdks/go/pkg/beam/runners/prism/internal/engine/elementmanager.go +++ b/sdks/go/pkg/beam/runners/prism/internal/engine/elementmanager.go @@ -187,8 +187,7 @@ type ElementManager struct { config Config nextBundID func() string // Generates unique bundleIDs. Set in the Bundles method. - impulses set[string] // List of impulse stages. 
- stages map[string]*stageState // The state for each stage. + stages map[string]*stageState // The state for each stage. consumers map[string][]string // Map from pcollectionID to stageIDs that consumes them as primary input. sideConsumers map[string][]LinkID // Map from pcollectionID to the stage+transform+input that consumes them as side input. @@ -250,7 +249,6 @@ func (em *ElementManager) AddStage(ID string, inputIDs, outputIDs []string, side // so we must do it here. if len(inputIDs) == 0 { refreshes := singleSet(ss.ID) - em.addToTestStreamImpulseSet(refreshes) em.markStagesAsChanged(refreshes) } @@ -319,22 +317,9 @@ func (em *ElementManager) Impulse(stageID string) { em.addPending(count) } refreshes := stage.updateWatermarks(em) - - em.addToTestStreamImpulseSet(refreshes) em.markStagesAsChanged(refreshes) } -// addToTestStreamImpulseSet adds to the set of stages to refresh on pipeline start. -// We keep this separate since impulses are synthetic. In a test stream driven pipeline -// these will need to be stimulated separately, to ensure the test stream has progressed. -func (em *ElementManager) addToTestStreamImpulseSet(refreshes set[string]) { - if em.impulses == nil { - em.impulses = refreshes - } else { - em.impulses.merge(refreshes) - } -} - type RunBundle struct { StageID string BundleID string @@ -373,12 +358,6 @@ func (em *ElementManager) Bundles(ctx context.Context, upstreamCancelFn context. }() defer close(runStageCh) - // If we have a test stream, clear out existing changed stages, - // so the test stream can insert any elements it needs. - if em.testStreamHandler != nil { - em.changedStages = singleSet(em.testStreamHandler.ID) - } - for { em.refreshCond.L.Lock() // Check if processing time has advanced before the wait loop. diff --git a/sdks/go/pkg/beam/runners/prism/internal/engine/teststream.go b/sdks/go/pkg/beam/runners/prism/internal/engine/teststream.go index ed3df75fd8ef..533cd5a0fc40 100644 --- a/sdks/go/pkg/beam/runners/prism/internal/engine/teststream.go +++ b/sdks/go/pkg/beam/runners/prism/internal/engine/teststream.go @@ -142,10 +142,8 @@ func (ts *testStreamHandler) UpdateHold(em *ElementManager, newHold mtime.Time) ts.currentHold = newHold ss.watermarkHolds.Add(ts.currentHold, 1) - // kick the TestStream and Impulse stages too. + // kick the TestStream stage to ensure downstream watermark propagation. kick := singleSet(ts.ID) - kick.merge(em.impulses) - // This executes under the refreshCond lock, so we can't call em.addRefreshes. 
em.changedStages.merge(kick) em.refreshCond.Broadcast() diff --git a/sdks/go/pkg/beam/runners/prism/internal/execute.go b/sdks/go/pkg/beam/runners/prism/internal/execute.go index 2cc62769d2a9..d41c3cd9c75c 100644 --- a/sdks/go/pkg/beam/runners/prism/internal/execute.go +++ b/sdks/go/pkg/beam/runners/prism/internal/execute.go @@ -269,7 +269,6 @@ func executePipeline(ctx context.Context, wks map[string]*worker.W, j *jobservic elms = append(elms, engine.TestStreamElement{Encoded: mayLP(e.GetEncodedElement()), EventTime: mtime.Time(e.GetTimestamp())}) } tsb.AddElementEvent(ev.ElementEvent.GetTag(), elms) - ev.ElementEvent.GetTag() case *pipepb.TestStreamPayload_Event_WatermarkEvent: tsb.AddWatermarkEvent(ev.WatermarkEvent.GetTag(), mtime.Time(ev.WatermarkEvent.GetNewWatermark())) case *pipepb.TestStreamPayload_Event_ProcessingTimeEvent: diff --git a/sdks/go/pkg/beam/runners/prism/internal/preprocess.go b/sdks/go/pkg/beam/runners/prism/internal/preprocess.go index 0d3ec7c365c1..2048d32e4ad4 100644 --- a/sdks/go/pkg/beam/runners/prism/internal/preprocess.go +++ b/sdks/go/pkg/beam/runners/prism/internal/preprocess.go @@ -445,7 +445,9 @@ func finalizeStage(stg *stage, comps *pipepb.Components, pipelineFacts *fusionFa if err := (proto.UnmarshalOptions{}).Unmarshal(t.GetSpec().GetPayload(), pardo); err != nil { return fmt.Errorf("unable to decode ParDoPayload for %v", link.Transform) } - stg.finalize = pardo.RequestsFinalization + if pardo.GetRequestsFinalization() { + stg.finalize = true + } if len(pardo.GetTimerFamilySpecs())+len(pardo.GetStateSpecs())+len(pardo.GetOnWindowExpirationTimerFamilySpec()) > 0 { stg.stateful = true } diff --git a/sdks/java/core/src/test/java/org/apache/beam/sdk/transforms/ParDoTest.java b/sdks/java/core/src/test/java/org/apache/beam/sdk/transforms/ParDoTest.java index fb2321328b32..742547b9b6c3 100644 --- a/sdks/java/core/src/test/java/org/apache/beam/sdk/transforms/ParDoTest.java +++ b/sdks/java/core/src/test/java/org/apache/beam/sdk/transforms/ParDoTest.java @@ -1593,7 +1593,7 @@ public void populateDisplayData(DisplayData.Builder builder) { @RunWith(JUnit4.class) public static class BundleFinalizationTests extends SharedTestBase implements Serializable { private abstract static class BundleFinalizingDoFn extends DoFn, String> { - private static final long MAX_ATTEMPTS = 3000; + private static final long MAX_ATTEMPTS = 100; // We use the UUID to uniquely identify this DoFn in case this test is run with // other tests in the same JVM. private static final Map WAS_FINALIZED = new HashMap(); @@ -1637,9 +1637,15 @@ public Void apply(Iterable input) { public void testBundleFinalization() { TestStream.Builder> stream = TestStream.create(KvCoder.of(StringUtf8Coder.of(), VarLongCoder.of())); - for (long i = 0; i < BundleFinalizingDoFn.MAX_ATTEMPTS; ++i) { + long attemptCap = BundleFinalizingDoFn.MAX_ATTEMPTS - 1; + for (long i = 0; i < attemptCap; ++i) { stream = stream.addElements(KV.of("key" + (i % 10), i)); } + // Advance the time, and add the final element. This allows Finalization + // check mechanism to work without being sensitive to how bundles are + // produced by a runner. 
+ stream = stream.advanceWatermarkTo(new Instant(10)); + stream = stream.addElements(KV.of("key" + (attemptCap % 10), attemptCap)); PCollection output = pipeline .apply(stream.advanceWatermarkToInfinity()) @@ -1677,6 +1683,8 @@ public void testBundleFinalizationWithState() { for (long i = 0; i < BundleFinalizingDoFn.MAX_ATTEMPTS; ++i) { stream = stream.addElements(KV.of("key" + (i % 10), i)); } + // Stateful execution is already per-key, so it is unnecessary to add a + // "final" element to attempt additional bundles to validate finalization. PCollection output = pipeline .apply(stream.advanceWatermarkToInfinity()) @@ -1715,9 +1723,15 @@ public void processElement( public void testBundleFinalizationWithSideInputs() { TestStream.Builder> stream = TestStream.create(KvCoder.of(StringUtf8Coder.of(), VarLongCoder.of())); - for (long i = 0; i < BundleFinalizingDoFn.MAX_ATTEMPTS; ++i) { + long attemptCap = BundleFinalizingDoFn.MAX_ATTEMPTS - 1; + for (long i = 0; i < attemptCap; ++i) { stream = stream.addElements(KV.of("key" + (i % 10), i)); } + // Advance the time, and add the final element. This allows Finalization + // check mechanism to work without being sensitive to how bundles are + // produced by a runner. + stream = stream.advanceWatermarkTo(GlobalWindow.INSTANCE.maxTimestamp()); + stream = stream.addElements(KV.of("key" + (attemptCap % 10), attemptCap)); PCollectionView sideInput = pipeline.apply(Create.of("sideInput value")).apply(View.asSingleton()); PCollection output = From bbccf52f706aba0c31ed9c1fe671151b785f0b79 Mon Sep 17 00:00:00 2001 From: Danny McCormick Date: Mon, 6 Jan 2025 14:42:57 -0500 Subject: [PATCH 20/43] Build wheels on gha runners instead of local runners (#33505) * Debug wheels * Try gha runners * Add back other workflows * Update build_wheels.yml --- .github/workflows/build_wheels.yml | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/.github/workflows/build_wheels.yml b/.github/workflows/build_wheels.yml index 828a6328c0cd..20706e77d0cd 100644 --- a/.github/workflows/build_wheels.yml +++ b/.github/workflows/build_wheels.yml @@ -219,6 +219,7 @@ jobs: runs-on: ${{ matrix.os_python.runner }} timeout-minutes: 480 strategy: + fail-fast: false matrix: os_python: [ {"os": "ubuntu-20.04", "runner": [self-hosted, ubuntu-20.04, main], "python": "${{ needs.check_env_variables.outputs.py-versions-full }}", arch: "auto" }, @@ -226,7 +227,7 @@ jobs: # TODO(https://github.com/apache/beam/issues/31114) {"os": "macos-13", "runner": "macos-13", "python": "${{ needs.check_env_variables.outputs.py-versions-test }}", arch: "auto" }, {"os": "windows-latest", "runner": "windows-latest", "python": "${{ needs.check_env_variables.outputs.py-versions-test }}", arch: "auto" }, - {"os": "ubuntu-20.04", "runner": [self-hosted, ubuntu-20.04, main], "python": "${{ needs.check_env_variables.outputs.py-versions-test }}", arch: "aarch64" } + {"os": "ubuntu-20.04", "runner": "ubuntu-latest", "python": "${{ needs.check_env_variables.outputs.py-versions-test }}", arch: "aarch64" } ] # Keep in sync (remove asterisks) with PY_VERSIONS_FULL env var above - if changed, change that as well. 
py_version: ["cp39-", "cp310-", "cp311-", "cp312-"] From d6e0b0c5c0f4cd8acad230833c2ed42e220d6178 Mon Sep 17 00:00:00 2001 From: Filipe Regadas Date: Mon, 6 Jan 2025 14:52:22 -0500 Subject: [PATCH 21/43] Add support for Iceberg table identifiers with special characters (#33293) * Add support for Iceberg table identifiers with special characters * Add used undeclared dependencies * Fix style * Trigger iceberg integration tests --- .../IO_Iceberg_Integration_Tests.json | 2 +- sdks/java/io/iceberg/build.gradle | 4 +- .../sdk/io/iceberg/AppendFilesToTables.java | 3 +- .../beam/sdk/io/iceberg/FileWriteResult.java | 5 ++- .../IcebergReadSchemaTransformProvider.java | 3 +- .../sdk/io/iceberg/IcebergScanConfig.java | 7 +++- .../beam/sdk/io/iceberg/IcebergUtils.java | 17 ++++++++ .../iceberg/OneTableDynamicDestinations.java | 4 +- .../iceberg/PortableIcebergDestinations.java | 3 +- .../sdk/io/iceberg/IcebergIOReadTest.java | 24 +++++++++-- .../beam/sdk/io/iceberg/IcebergUtilsTest.java | 40 +++++++++++++++++++ 11 files changed, 94 insertions(+), 18 deletions(-) diff --git a/.github/trigger_files/IO_Iceberg_Integration_Tests.json b/.github/trigger_files/IO_Iceberg_Integration_Tests.json index 3f63c0c9975f..bbdc3a3910ef 100644 --- a/.github/trigger_files/IO_Iceberg_Integration_Tests.json +++ b/.github/trigger_files/IO_Iceberg_Integration_Tests.json @@ -1,4 +1,4 @@ { "comment": "Modify this file in a trivial way to cause this test suite to run", - "modification": 2 + "modification": 3 } diff --git a/sdks/java/io/iceberg/build.gradle b/sdks/java/io/iceberg/build.gradle index 147c49b21c65..f7a9e5c8533d 100644 --- a/sdks/java/io/iceberg/build.gradle +++ b/sdks/java/io/iceberg/build.gradle @@ -55,8 +55,10 @@ dependencies { implementation "org.apache.iceberg:iceberg-api:$iceberg_version" implementation "org.apache.iceberg:iceberg-parquet:$iceberg_version" implementation "org.apache.iceberg:iceberg-orc:$iceberg_version" - implementation library.java.hadoop_common runtimeOnly "org.apache.iceberg:iceberg-gcp:$iceberg_version" + implementation library.java.hadoop_common + implementation library.java.jackson_core + implementation library.java.jackson_databind testImplementation project(":sdks:java:managed") testImplementation library.java.hadoop_client diff --git a/sdks/java/io/iceberg/src/main/java/org/apache/beam/sdk/io/iceberg/AppendFilesToTables.java b/sdks/java/io/iceberg/src/main/java/org/apache/beam/sdk/io/iceberg/AppendFilesToTables.java index d9768114e7c6..deec779c6cc9 100644 --- a/sdks/java/io/iceberg/src/main/java/org/apache/beam/sdk/io/iceberg/AppendFilesToTables.java +++ b/sdks/java/io/iceberg/src/main/java/org/apache/beam/sdk/io/iceberg/AppendFilesToTables.java @@ -47,7 +47,6 @@ import org.apache.iceberg.Snapshot; import org.apache.iceberg.Table; import org.apache.iceberg.catalog.Catalog; -import org.apache.iceberg.catalog.TableIdentifier; import org.apache.iceberg.io.FileIO; import org.apache.iceberg.io.OutputFile; import org.checkerframework.checker.nullness.qual.MonotonicNonNull; @@ -134,7 +133,7 @@ public void processElement( return; } - Table table = getCatalog().loadTable(TableIdentifier.parse(element.getKey())); + Table table = getCatalog().loadTable(IcebergUtils.parseTableIdentifier(element.getKey())); // vast majority of the time, we will simply append data files. 
// in the rare case we get a batch that contains multiple partition specs, we will group diff --git a/sdks/java/io/iceberg/src/main/java/org/apache/beam/sdk/io/iceberg/FileWriteResult.java b/sdks/java/io/iceberg/src/main/java/org/apache/beam/sdk/io/iceberg/FileWriteResult.java index bf00bf8519fc..d58ac8696d37 100644 --- a/sdks/java/io/iceberg/src/main/java/org/apache/beam/sdk/io/iceberg/FileWriteResult.java +++ b/sdks/java/io/iceberg/src/main/java/org/apache/beam/sdk/io/iceberg/FileWriteResult.java @@ -25,6 +25,7 @@ import org.apache.iceberg.DataFile; import org.apache.iceberg.PartitionSpec; import org.apache.iceberg.catalog.TableIdentifier; +import org.apache.iceberg.catalog.TableIdentifierParser; import org.checkerframework.checker.nullness.qual.MonotonicNonNull; @AutoValue @@ -41,7 +42,7 @@ abstract class FileWriteResult { @SchemaIgnore public TableIdentifier getTableIdentifier() { if (cachedTableIdentifier == null) { - cachedTableIdentifier = TableIdentifier.parse(getTableIdentifierString()); + cachedTableIdentifier = IcebergUtils.parseTableIdentifier(getTableIdentifierString()); } return cachedTableIdentifier; } @@ -67,7 +68,7 @@ abstract static class Builder { @SchemaIgnore public Builder setTableIdentifier(TableIdentifier tableId) { - return setTableIdentifierString(tableId.toString()); + return setTableIdentifierString(TableIdentifierParser.toJson(tableId)); } public abstract FileWriteResult build(); diff --git a/sdks/java/io/iceberg/src/main/java/org/apache/beam/sdk/io/iceberg/IcebergReadSchemaTransformProvider.java b/sdks/java/io/iceberg/src/main/java/org/apache/beam/sdk/io/iceberg/IcebergReadSchemaTransformProvider.java index d44149fda08e..951442e2c95f 100644 --- a/sdks/java/io/iceberg/src/main/java/org/apache/beam/sdk/io/iceberg/IcebergReadSchemaTransformProvider.java +++ b/sdks/java/io/iceberg/src/main/java/org/apache/beam/sdk/io/iceberg/IcebergReadSchemaTransformProvider.java @@ -31,7 +31,6 @@ import org.apache.beam.sdk.values.PCollection; import org.apache.beam.sdk.values.PCollectionRowTuple; import org.apache.beam.sdk.values.Row; -import org.apache.iceberg.catalog.TableIdentifier; /** * SchemaTransform implementation for {@link IcebergIO#readRows}. 
Reads records from Iceberg and @@ -86,7 +85,7 @@ public PCollectionRowTuple expand(PCollectionRowTuple input) { .getPipeline() .apply( IcebergIO.readRows(configuration.getIcebergCatalog()) - .from(TableIdentifier.parse(configuration.getTable()))); + .from(IcebergUtils.parseTableIdentifier(configuration.getTable()))); return PCollectionRowTuple.of(OUTPUT_TAG, output); } diff --git a/sdks/java/io/iceberg/src/main/java/org/apache/beam/sdk/io/iceberg/IcebergScanConfig.java b/sdks/java/io/iceberg/src/main/java/org/apache/beam/sdk/io/iceberg/IcebergScanConfig.java index 60372b172af7..640283d83c2e 100644 --- a/sdks/java/io/iceberg/src/main/java/org/apache/beam/sdk/io/iceberg/IcebergScanConfig.java +++ b/sdks/java/io/iceberg/src/main/java/org/apache/beam/sdk/io/iceberg/IcebergScanConfig.java @@ -23,6 +23,7 @@ import org.apache.beam.vendor.guava.v32_1_2_jre.com.google.common.collect.ImmutableMap; import org.apache.iceberg.Table; import org.apache.iceberg.catalog.TableIdentifier; +import org.apache.iceberg.catalog.TableIdentifierParser; import org.apache.iceberg.expressions.Expression; import org.checkerframework.checker.nullness.qual.MonotonicNonNull; import org.checkerframework.checker.nullness.qual.Nullable; @@ -51,7 +52,9 @@ public enum ScanType { public Table getTable() { if (cachedTable == null) { cachedTable = - getCatalogConfig().catalog().loadTable(TableIdentifier.parse(getTableIdentifier())); + getCatalogConfig() + .catalog() + .loadTable(IcebergUtils.parseTableIdentifier(getTableIdentifier())); } return cachedTable; } @@ -126,7 +129,7 @@ public abstract static class Builder { public abstract Builder setTableIdentifier(String tableIdentifier); public Builder setTableIdentifier(TableIdentifier tableIdentifier) { - return this.setTableIdentifier(tableIdentifier.toString()); + return this.setTableIdentifier(TableIdentifierParser.toJson(tableIdentifier)); } public Builder setTableIdentifier(String... names) { diff --git a/sdks/java/io/iceberg/src/main/java/org/apache/beam/sdk/io/iceberg/IcebergUtils.java b/sdks/java/io/iceberg/src/main/java/org/apache/beam/sdk/io/iceberg/IcebergUtils.java index ef19a5881366..bd2f743172dc 100644 --- a/sdks/java/io/iceberg/src/main/java/org/apache/beam/sdk/io/iceberg/IcebergUtils.java +++ b/sdks/java/io/iceberg/src/main/java/org/apache/beam/sdk/io/iceberg/IcebergUtils.java @@ -19,6 +19,9 @@ import static org.apache.beam.sdk.util.Preconditions.checkArgumentNotNull; +import com.fasterxml.jackson.core.JsonProcessingException; +import com.fasterxml.jackson.databind.JsonNode; +import com.fasterxml.jackson.databind.ObjectMapper; import java.nio.ByteBuffer; import java.time.LocalDate; import java.time.LocalDateTime; @@ -36,6 +39,8 @@ import org.apache.beam.sdk.values.Row; import org.apache.beam.vendor.guava.v32_1_2_jre.com.google.common.annotations.VisibleForTesting; import org.apache.beam.vendor.guava.v32_1_2_jre.com.google.common.collect.ImmutableMap; +import org.apache.iceberg.catalog.TableIdentifier; +import org.apache.iceberg.catalog.TableIdentifierParser; import org.apache.iceberg.data.GenericRecord; import org.apache.iceberg.data.Record; import org.apache.iceberg.types.Type; @@ -47,6 +52,9 @@ /** Utilities for converting between Beam and Iceberg types, made public for user's convenience. 
*/ public class IcebergUtils { + + private static final ObjectMapper OBJECT_MAPPER = new ObjectMapper(); + private IcebergUtils() {} private static final Map BEAM_TYPES_TO_ICEBERG_TYPES = @@ -506,4 +514,13 @@ private static Object getLogicalTypeValue(Object icebergValue, Schema.FieldType // LocalDateTime, LocalDate, LocalTime return icebergValue; } + + public static TableIdentifier parseTableIdentifier(String table) { + try { + JsonNode jsonNode = OBJECT_MAPPER.readTree(table); + return TableIdentifierParser.fromJson(jsonNode); + } catch (JsonProcessingException e) { + return TableIdentifier.parse(table); + } + } } diff --git a/sdks/java/io/iceberg/src/main/java/org/apache/beam/sdk/io/iceberg/OneTableDynamicDestinations.java b/sdks/java/io/iceberg/src/main/java/org/apache/beam/sdk/io/iceberg/OneTableDynamicDestinations.java index 861a8ad198a8..be810aa20a13 100644 --- a/sdks/java/io/iceberg/src/main/java/org/apache/beam/sdk/io/iceberg/OneTableDynamicDestinations.java +++ b/sdks/java/io/iceberg/src/main/java/org/apache/beam/sdk/io/iceberg/OneTableDynamicDestinations.java @@ -41,7 +41,7 @@ class OneTableDynamicDestinations implements DynamicDestinations, Externalizable @VisibleForTesting TableIdentifier getTableIdentifier() { if (tableId == null) { - tableId = TableIdentifier.parse(checkStateNotNull(tableIdString)); + tableId = IcebergUtils.parseTableIdentifier(checkStateNotNull(tableIdString)); } return tableId; } @@ -86,6 +86,6 @@ public void writeExternal(ObjectOutput out) throws IOException { @Override public void readExternal(ObjectInput in) throws IOException { tableIdString = in.readUTF(); - tableId = TableIdentifier.parse(tableIdString); + tableId = IcebergUtils.parseTableIdentifier(tableIdString); } } diff --git a/sdks/java/io/iceberg/src/main/java/org/apache/beam/sdk/io/iceberg/PortableIcebergDestinations.java b/sdks/java/io/iceberg/src/main/java/org/apache/beam/sdk/io/iceberg/PortableIcebergDestinations.java index 47f661bba3f8..58f70463bc76 100644 --- a/sdks/java/io/iceberg/src/main/java/org/apache/beam/sdk/io/iceberg/PortableIcebergDestinations.java +++ b/sdks/java/io/iceberg/src/main/java/org/apache/beam/sdk/io/iceberg/PortableIcebergDestinations.java @@ -24,7 +24,6 @@ import org.apache.beam.sdk.values.Row; import org.apache.beam.sdk.values.ValueInSingleWindow; import org.apache.iceberg.FileFormat; -import org.apache.iceberg.catalog.TableIdentifier; import org.checkerframework.checker.nullness.qual.Nullable; class PortableIcebergDestinations implements DynamicDestinations { @@ -73,7 +72,7 @@ public String getTableStringIdentifier(ValueInSingleWindow element) { @Override public IcebergDestination instantiateDestination(String dest) { return IcebergDestination.builder() - .setTableIdentifier(TableIdentifier.parse(dest)) + .setTableIdentifier(IcebergUtils.parseTableIdentifier(dest)) .setTableCreateConfig(null) .setFileFormat(FileFormat.fromString(fileFormat)) .build(); diff --git a/sdks/java/io/iceberg/src/test/java/org/apache/beam/sdk/io/iceberg/IcebergIOReadTest.java b/sdks/java/io/iceberg/src/test/java/org/apache/beam/sdk/io/iceberg/IcebergIOReadTest.java index 39c621975547..6ff3bdf6a4ff 100644 --- a/sdks/java/io/iceberg/src/test/java/org/apache/beam/sdk/io/iceberg/IcebergIOReadTest.java +++ b/sdks/java/io/iceberg/src/test/java/org/apache/beam/sdk/io/iceberg/IcebergIOReadTest.java @@ -21,6 +21,8 @@ import static org.hamcrest.MatcherAssert.assertThat; import static org.hamcrest.Matchers.containsInAnyOrder; +import java.util.Arrays; +import java.util.Collection; import 
java.util.List; import java.util.Map; import java.util.UUID; @@ -46,11 +48,11 @@ import org.junit.Test; import org.junit.rules.TemporaryFolder; import org.junit.runner.RunWith; -import org.junit.runners.JUnit4; +import org.junit.runners.Parameterized; import org.slf4j.Logger; import org.slf4j.LoggerFactory; -@RunWith(JUnit4.class) +@RunWith(Parameterized.class) public class IcebergIOReadTest { private static final Logger LOG = LoggerFactory.getLogger(IcebergIOReadTest.class); @@ -61,6 +63,21 @@ public class IcebergIOReadTest { @Rule public TestPipeline testPipeline = TestPipeline.create(); + @Parameterized.Parameters + public static Collection data() { + return Arrays.asList( + new Object[][] { + {String.format("{\"namespace\": [\"default\"], \"name\": \"%s\"}", tableId())}, + {String.format("default.%s", tableId())}, + }); + } + + public static String tableId() { + return "table" + Long.toString(UUID.randomUUID().hashCode(), 16); + } + + @Parameterized.Parameter public String tableStringIdentifier; + static class PrintRow extends DoFn { @ProcessElement @@ -72,8 +89,7 @@ public void process(@Element Row row, OutputReceiver output) throws Excepti @Test public void testSimpleScan() throws Exception { - TableIdentifier tableId = - TableIdentifier.of("default", "table" + Long.toString(UUID.randomUUID().hashCode(), 16)); + TableIdentifier tableId = IcebergUtils.parseTableIdentifier(tableStringIdentifier); Table simpleTable = warehouse.createTable(tableId, TestFixtures.SCHEMA); final Schema schema = IcebergUtils.icebergSchemaToBeamSchema(TestFixtures.SCHEMA); diff --git a/sdks/java/io/iceberg/src/test/java/org/apache/beam/sdk/io/iceberg/IcebergUtilsTest.java b/sdks/java/io/iceberg/src/test/java/org/apache/beam/sdk/io/iceberg/IcebergUtilsTest.java index 134f05c34bfb..918c6b1146ee 100644 --- a/sdks/java/io/iceberg/src/test/java/org/apache/beam/sdk/io/iceberg/IcebergUtilsTest.java +++ b/sdks/java/io/iceberg/src/test/java/org/apache/beam/sdk/io/iceberg/IcebergUtilsTest.java @@ -19,11 +19,13 @@ import static org.apache.beam.sdk.io.iceberg.IcebergUtils.TypeAndMaxId; import static org.apache.beam.sdk.io.iceberg.IcebergUtils.beamFieldTypeToIcebergFieldType; +import static org.apache.beam.sdk.io.iceberg.IcebergUtils.parseTableIdentifier; import static org.apache.iceberg.types.Types.NestedField.optional; import static org.apache.iceberg.types.Types.NestedField.required; import static org.hamcrest.MatcherAssert.assertThat; import static org.hamcrest.Matchers.equalTo; import static org.junit.Assert.assertEquals; +import static org.junit.Assert.assertThrows; import static org.junit.Assert.assertTrue; import java.math.BigDecimal; @@ -32,6 +34,7 @@ import java.time.OffsetDateTime; import java.time.ZoneOffset; import java.util.Arrays; +import java.util.Collection; import java.util.List; import java.util.Map; import org.apache.beam.sdk.schemas.Schema; @@ -49,6 +52,7 @@ import org.junit.experimental.runners.Enclosed; import org.junit.runner.RunWith; import org.junit.runners.JUnit4; +import org.junit.runners.Parameterized; /** Test class for {@link IcebergUtils}. 
*/ @RunWith(Enclosed.class) @@ -802,4 +806,40 @@ public void testStructIcebergSchemaToBeamSchema() { assertEquals(BEAM_SCHEMA_STRUCT, convertedBeamSchema); } } + + @RunWith(Parameterized.class) + public static class TableIdentifierParseTests { + + @Parameterized.Parameters + public static Collection data() { + return Arrays.asList( + new Object[][] { + { + "{\"namespace\": [\"dogs\", \"owners.and.handlers\"], \"name\": \"food\"}", + "dogs.owners.and.handlers.food", + true + }, + {"dogs.owners.and.handlers.food", "dogs.owners.and.handlers.food", true}, + {"{\"name\": \"food\"}", "food", true}, + {"{\"table_name\": \"food\"}", "{\"table_name\": \"food\"}", false}, + }); + } + + @Parameterized.Parameter public String input; + + @Parameterized.Parameter(1) + public String expected; + + @Parameterized.Parameter(2) + public boolean shouldSucceed; + + @Test + public void test() { + if (shouldSucceed) { + assertEquals(expected, parseTableIdentifier(input).toString()); + } else { + assertThrows(IllegalArgumentException.class, () -> parseTableIdentifier(input)); + } + } + } } From e0b04bf27329d30a2404abe429cda824df9b052a Mon Sep 17 00:00:00 2001 From: liferoad Date: Mon, 6 Jan 2025 15:41:00 -0500 Subject: [PATCH 22/43] Update 2024.md (#33502) --- contributor-docs/discussion-docs/2024.md | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/contributor-docs/discussion-docs/2024.md b/contributor-docs/discussion-docs/2024.md index 124fe8ef9bb7..a5e1202997ed 100644 --- a/contributor-docs/discussion-docs/2024.md +++ b/contributor-docs/discussion-docs/2024.md @@ -44,3 +44,10 @@ limitations under the License. | 27 | Ahmed Abualsaud | [Python Multi-language with SchemaTransforms](https://docs.google.com/document/d/1_embA3pGwoYG7sbHaYzAkg3hNxjTughhFCY8ThcoK_Q) | 2024-08-26 19:53:10 | | 28 | Kenneth Knowles | [DRAFT - Apache Beam Board Report - September 2024](https://s.apache.org/beam-draft-report-2024-09) | 2024-09-11 15:01:55 | | 29 | Jeff Kinard | [Beam YA(ML)^2](https://docs.google.com/document/d/1z9lNlSBfqDVdOP1frJNv_NJoMR1F1VBI29wn788x6IE/) | 2024-09-11 15:01:55 | +| 30 | Ahmed Abualsaud | [Beam Dynamic Destinations Naming](https://docs.google.com/document/d/1IIn4cjF9eYASnjSmVmmAt6ymFnpBxHgBKVPgpnQ12G4) | 2024-09-18 15:01:55 | +| 31 | Claude van der Merwe | [RAG with Apache Beam ](https://docs.google.com/document/d/1j-kujrxHw4R3-oT4pVAwEIqejoCXhFedqZnBUF8AKBQ) | 2024-11-08 16:37:00 | +| 32 | Shunping Huang | [Anomaly Detection with Beam](https://docs.google.com/document/d/1tE8lz9U_vjlNn2H7t-GRrs3vfhQ5UuCgWiHXCRHRPns) | 2024-12-13 10:37:00 | +| 33 | Radek Stankiewicz | [Kerberized Worker Harness](https://docs.google.com/document/d/1T3Py6VZhP-FNQMjiURj38ddZyhWQRa_vDEUEc4f1P5A) | 2024-12-16 07:27:00 | + + + From e657d6c8cec8ee4d52566fee32ce752bb5798f69 Mon Sep 17 00:00:00 2001 From: Tim Heckman <60020185+t2h6@users.noreply.github.com> Date: Tue, 7 Jan 2025 02:42:23 +0000 Subject: [PATCH 23/43] Adds an SdkHarnessOption that controls whether logging is redirected through the FnApi (#33418) * Add an SdkHarnessOption that controls whether logging is redirected through the FnApi logging service. Redirection through the logging service is enabled by default. * Add an SdkHarnessOption that controls whether logging is redirected through the FnApi logging service. Redirection through the FnApi is enabled by default. 
* include license in new files * fix ManagedChannel dep * fix invalid conversions * fix missing imports * fix type mismatch * fix up tests * continue to use anyOf when logViaFnApi is enabled * More comments on the new SdkHarnessOption. DataflowRunner.run() forces the option to 'enabled'. --- .../beam/runners/dataflow/DataflowRunner.java | 3 + .../beam/sdk/options/SdkHarnessOptions.java | 10 ++++ .../logging/BeamFnLoggingClientBenchmark.java | 6 +- .../org/apache/beam/fn/harness/FnHarness.java | 14 +++-- .../harness/logging/BeamFnLoggingClient.java | 5 +- .../fn/harness/logging/LoggingClient.java | 25 ++++++++ .../harness/logging/LoggingClientFactory.java | 59 +++++++++++++++++++ .../data/PCollectionConsumerRegistryTest.java | 7 ++- 8 files changed, 118 insertions(+), 11 deletions(-) create mode 100644 sdks/java/harness/src/main/java/org/apache/beam/fn/harness/logging/LoggingClient.java create mode 100644 sdks/java/harness/src/main/java/org/apache/beam/fn/harness/logging/LoggingClientFactory.java diff --git a/runners/google-cloud-dataflow-java/src/main/java/org/apache/beam/runners/dataflow/DataflowRunner.java b/runners/google-cloud-dataflow-java/src/main/java/org/apache/beam/runners/dataflow/DataflowRunner.java index ce99958c57fd..9ca6e95ed95a 100644 --- a/runners/google-cloud-dataflow-java/src/main/java/org/apache/beam/runners/dataflow/DataflowRunner.java +++ b/runners/google-cloud-dataflow-java/src/main/java/org/apache/beam/runners/dataflow/DataflowRunner.java @@ -103,6 +103,7 @@ import org.apache.beam.sdk.options.ExperimentalOptions; import org.apache.beam.sdk.options.PipelineOptions; import org.apache.beam.sdk.options.PipelineOptionsValidator; +import org.apache.beam.sdk.options.SdkHarnessOptions; import org.apache.beam.sdk.options.ValueProvider.NestedValueProvider; import org.apache.beam.sdk.runners.AppliedPTransform; import org.apache.beam.sdk.runners.PTransformOverride; @@ -1252,6 +1253,8 @@ public DataflowPipelineJob run(Pipeline pipeline) { experiments.add("use_portable_job_submission"); } options.setExperiments(ImmutableList.copyOf(experiments)); + // Ensure that logging via the FnApi is enabled + options.as(SdkHarnessOptions.class).setEnableLogViaFnApi(true); } logWarningIfPCollectionViewHasNonDeterministicKeyCoder(pipeline); diff --git a/sdks/java/core/src/main/java/org/apache/beam/sdk/options/SdkHarnessOptions.java b/sdks/java/core/src/main/java/org/apache/beam/sdk/options/SdkHarnessOptions.java index 78ea34503e54..2981046a0a41 100644 --- a/sdks/java/core/src/main/java/org/apache/beam/sdk/options/SdkHarnessOptions.java +++ b/sdks/java/core/src/main/java/org/apache/beam/sdk/options/SdkHarnessOptions.java @@ -110,6 +110,16 @@ enum LogLevel { void setLogMdc(boolean value); + /** This option controls whether logging will be redirected through the FnApi. */ + @Description( + "Controls whether logging will be redirected through the FnApi. In normal usage, setting " + + "this to a non-default value will cause log messages to be dropped.") + @Default.Boolean(true) + @Hidden + boolean getEnableLogViaFnApi(); + + void setEnableLogViaFnApi(boolean enableLogViaFnApi); + /** * Size (in MB) of each grouping table used to pre-combine elements. Larger values may reduce the * amount of data shuffled. If unset, defaults to 100 MB. 
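(A quick, hedged illustration of the option added in the hunk above, not part of the patch itself: the getter name and the true default come from the diff, while the --enableLogViaFnApi flag spelling assumes Beam's usual property-name-to-flag mapping.)

import org.apache.beam.sdk.options.PipelineOptionsFactory;
import org.apache.beam.sdk.options.SdkHarnessOptions;

public class EnableLogViaFnApiExample {
  public static void main(String[] args) {
    // e.g. args = {"--enableLogViaFnApi=false"} to opt out (assumed flag spelling).
    SdkHarnessOptions options =
        PipelineOptionsFactory.fromArgs(args).withValidation().as(SdkHarnessOptions.class);

    // @Default.Boolean(true) means this is true unless explicitly disabled;
    // DataflowRunner.run() additionally forces it back to true before job submission.
    System.out.println(
        "Redirect logging through the FnAPI: " + options.getEnableLogViaFnApi());
  }
}

(The remaining hunks in this patch thread the same flag through FnHarness and the new LoggingClientFactory.)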
diff --git a/sdks/java/harness/jmh/src/main/java/org/apache/beam/fn/harness/jmh/logging/BeamFnLoggingClientBenchmark.java b/sdks/java/harness/jmh/src/main/java/org/apache/beam/fn/harness/jmh/logging/BeamFnLoggingClientBenchmark.java index b9e4b20db00b..745e1f078646 100644 --- a/sdks/java/harness/jmh/src/main/java/org/apache/beam/fn/harness/jmh/logging/BeamFnLoggingClientBenchmark.java +++ b/sdks/java/harness/jmh/src/main/java/org/apache/beam/fn/harness/jmh/logging/BeamFnLoggingClientBenchmark.java @@ -24,6 +24,8 @@ import java.util.concurrent.atomic.AtomicInteger; import org.apache.beam.fn.harness.logging.BeamFnLoggingClient; import org.apache.beam.fn.harness.logging.BeamFnLoggingMDC; +import org.apache.beam.fn.harness.logging.LoggingClient; +import org.apache.beam.fn.harness.logging.LoggingClientFactory; import org.apache.beam.fn.harness.logging.QuotaEvent; import org.apache.beam.model.fnexecution.v1.BeamFnApi; import org.apache.beam.model.fnexecution.v1.BeamFnLoggingGrpc; @@ -80,7 +82,7 @@ public void onCompleted() { /** Setup a simple logging service and configure the {@link BeamFnLoggingClient}. */ @State(Scope.Benchmark) public static class ManageLoggingClientAndService { - public final BeamFnLoggingClient loggingClient; + public final LoggingClient loggingClient; public final CallCountLoggingService loggingService; public final Server server; @@ -98,7 +100,7 @@ public ManageLoggingClientAndService() { .build(); server.start(); loggingClient = - BeamFnLoggingClient.createAndStart( + LoggingClientFactory.createAndStart( PipelineOptionsFactory.create(), apiServiceDescriptor, managedChannelFactory::forDescriptor); diff --git a/sdks/java/harness/src/main/java/org/apache/beam/fn/harness/FnHarness.java b/sdks/java/harness/src/main/java/org/apache/beam/fn/harness/FnHarness.java index 9df9f12bc52b..3c8784d0ee42 100644 --- a/sdks/java/harness/src/main/java/org/apache/beam/fn/harness/FnHarness.java +++ b/sdks/java/harness/src/main/java/org/apache/beam/fn/harness/FnHarness.java @@ -39,7 +39,8 @@ import org.apache.beam.fn.harness.control.ProcessBundleHandler; import org.apache.beam.fn.harness.data.BeamFnDataGrpcClient; import org.apache.beam.fn.harness.debug.DataSampler; -import org.apache.beam.fn.harness.logging.BeamFnLoggingClient; +import org.apache.beam.fn.harness.logging.LoggingClient; +import org.apache.beam.fn.harness.logging.LoggingClientFactory; import org.apache.beam.fn.harness.state.BeamFnStateGrpcClientCache; import org.apache.beam.fn.harness.status.BeamFnStatusClient; import org.apache.beam.fn.harness.stream.HarnessStreamObserverFactories; @@ -62,6 +63,7 @@ import org.apache.beam.sdk.options.ExecutorOptions; import org.apache.beam.sdk.options.ExperimentalOptions; import org.apache.beam.sdk.options.PipelineOptions; +import org.apache.beam.sdk.options.SdkHarnessOptions; import org.apache.beam.sdk.util.construction.CoderTranslation; import org.apache.beam.sdk.util.construction.PipelineOptionsTranslation; import org.apache.beam.vendor.grpc.v1p60p1.com.google.protobuf.TextFormat; @@ -283,8 +285,8 @@ public static void main( // The logging client variable is not used per se, but during its lifetime (until close()) it // intercepts logging and sends it to the logging service. - try (BeamFnLoggingClient logging = - BeamFnLoggingClient.createAndStart( + try (LoggingClient logging = + LoggingClientFactory.createAndStart( options, loggingApiServiceDescriptor, channelFactory::forDescriptor)) { LOG.info("Fn Harness started"); // Register standard file systems. 
@@ -410,7 +412,11 @@ private BeamFnApi.ProcessBundleDescriptor loadDescriptor(String id) { outboundObserverFactory, executorService, handlers); - CompletableFuture.anyOf(control.terminationFuture(), logging.terminationFuture()).get(); + if (options.as(SdkHarnessOptions.class).getEnableLogViaFnApi()) { + CompletableFuture.anyOf(control.terminationFuture(), logging.terminationFuture()).get(); + } else { + control.terminationFuture().get(); + } if (beamFnStatusClient != null) { beamFnStatusClient.close(); } diff --git a/sdks/java/harness/src/main/java/org/apache/beam/fn/harness/logging/BeamFnLoggingClient.java b/sdks/java/harness/src/main/java/org/apache/beam/fn/harness/logging/BeamFnLoggingClient.java index 7812d8c0bc30..112104e4d251 100644 --- a/sdks/java/harness/src/main/java/org/apache/beam/fn/harness/logging/BeamFnLoggingClient.java +++ b/sdks/java/harness/src/main/java/org/apache/beam/fn/harness/logging/BeamFnLoggingClient.java @@ -68,7 +68,7 @@ /** * Configures {@link java.util.logging} to send all {@link LogRecord}s via the Beam Fn Logging API. */ -public class BeamFnLoggingClient implements AutoCloseable { +public class BeamFnLoggingClient implements LoggingClient { private static final String ROOT_LOGGER_NAME = ""; private static final ImmutableMap LOG_LEVEL_MAP = ImmutableMap.builder() @@ -119,7 +119,7 @@ public class BeamFnLoggingClient implements AutoCloseable { */ private @Nullable Thread logEntryHandlerThread = null; - public static BeamFnLoggingClient createAndStart( + static BeamFnLoggingClient createAndStart( PipelineOptions options, Endpoints.ApiServiceDescriptor apiServiceDescriptor, Function channelFactory) { @@ -383,6 +383,7 @@ void flushFinalLogs(@UnderInitialization BeamFnLoggingClient this) { } } + @Override public CompletableFuture terminationFuture() { checkNotNull(bufferedLogConsumer, "BeamFnLoggingClient not fully started"); return bufferedLogConsumer; diff --git a/sdks/java/harness/src/main/java/org/apache/beam/fn/harness/logging/LoggingClient.java b/sdks/java/harness/src/main/java/org/apache/beam/fn/harness/logging/LoggingClient.java new file mode 100644 index 000000000000..3c1972ec643d --- /dev/null +++ b/sdks/java/harness/src/main/java/org/apache/beam/fn/harness/logging/LoggingClient.java @@ -0,0 +1,25 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +package org.apache.beam.fn.harness.logging; + +import java.util.concurrent.CompletableFuture; + +public interface LoggingClient extends AutoCloseable { + + CompletableFuture terminationFuture(); +} diff --git a/sdks/java/harness/src/main/java/org/apache/beam/fn/harness/logging/LoggingClientFactory.java b/sdks/java/harness/src/main/java/org/apache/beam/fn/harness/logging/LoggingClientFactory.java new file mode 100644 index 000000000000..f61b8d3c4ba3 --- /dev/null +++ b/sdks/java/harness/src/main/java/org/apache/beam/fn/harness/logging/LoggingClientFactory.java @@ -0,0 +1,59 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.beam.fn.harness.logging; + +import java.util.concurrent.CompletableFuture; +import java.util.function.Function; +import org.apache.beam.model.pipeline.v1.Endpoints; +import org.apache.beam.sdk.options.PipelineOptions; +import org.apache.beam.sdk.options.SdkHarnessOptions; +import org.apache.beam.vendor.grpc.v1p60p1.io.grpc.ManagedChannel; + +/** + * A factory for {@link LoggingClient}s. Provides {@link BeamFnLoggingClient} if the logging service + * is enabled, otherwise provides a no-op client. + */ +public class LoggingClientFactory { + + private LoggingClientFactory() {} + + /** + * A factory for {@link LoggingClient}s. Provides {@link BeamFnLoggingClient} if the logging + * service is enabled, otherwise provides a no-op client. 
+ */ + public static LoggingClient createAndStart( + PipelineOptions options, + Endpoints.ApiServiceDescriptor apiServiceDescriptor, + Function channelFactory) { + if (options.as(SdkHarnessOptions.class).getEnableLogViaFnApi()) { + return BeamFnLoggingClient.createAndStart(options, apiServiceDescriptor, channelFactory); + } else { + return new NoOpLoggingClient(); + } + } + + static final class NoOpLoggingClient implements LoggingClient { + @Override + public CompletableFuture terminationFuture() { + return CompletableFuture.completedFuture(new Object()); + } + + @Override + public void close() throws Exception {} + } +} diff --git a/sdks/java/harness/src/test/java/org/apache/beam/fn/harness/data/PCollectionConsumerRegistryTest.java b/sdks/java/harness/src/test/java/org/apache/beam/fn/harness/data/PCollectionConsumerRegistryTest.java index 245c87f3e194..96e68586b3a5 100644 --- a/sdks/java/harness/src/test/java/org/apache/beam/fn/harness/data/PCollectionConsumerRegistryTest.java +++ b/sdks/java/harness/src/test/java/org/apache/beam/fn/harness/data/PCollectionConsumerRegistryTest.java @@ -47,8 +47,9 @@ import org.apache.beam.fn.harness.control.ExecutionStateSampler; import org.apache.beam.fn.harness.control.ExecutionStateSampler.ExecutionStateTracker; import org.apache.beam.fn.harness.debug.DataSampler; -import org.apache.beam.fn.harness.logging.BeamFnLoggingClient; import org.apache.beam.fn.harness.logging.BeamFnLoggingMDC; +import org.apache.beam.fn.harness.logging.LoggingClient; +import org.apache.beam.fn.harness.logging.LoggingClientFactory; import org.apache.beam.model.fnexecution.v1.BeamFnApi; import org.apache.beam.model.fnexecution.v1.BeamFnApi.ProcessBundleDescriptor; import org.apache.beam.model.fnexecution.v1.BeamFnLoggingGrpc; @@ -647,8 +648,8 @@ public StreamObserver logging( // Start the test within the logging context. This reroutes logging through to the boiler-plate // that was set up // earlier. - try (BeamFnLoggingClient ignored = - BeamFnLoggingClient.createAndStart( + try (LoggingClient ignored = + LoggingClientFactory.createAndStart( PipelineOptionsFactory.create(), apiServiceDescriptor, (Endpoints.ApiServiceDescriptor descriptor) -> channel)) { From 43b2bf7cb205f04d9fd776b662c3d86a3b25475f Mon Sep 17 00:00:00 2001 From: Andrej Galad Date: Tue, 7 Jan 2025 00:39:14 -0800 Subject: [PATCH 24/43] [feat 32473] Added soft deadline logic to Spanner Change Stream IO connector. 
(#32474) * added explicit restriction interrupter class --- .../action/ChildPartitionsRecordAction.java | 14 +- .../action/DataChangeRecordAction.java | 15 +- .../action/HeartbeatRecordAction.java | 11 +- .../action/QueryChangeStreamAction.java | 25 +++- .../restriction/RestrictionInterrupter.java | 85 ++++++++++++ .../ChildPartitionsRecordActionTest.java | 38 ++++- .../action/DataChangeRecordActionTest.java | 26 +++- .../action/HeartbeatRecordActionTest.java | 38 ++++- .../action/QueryChangeStreamActionTest.java | 130 ++++++++++++++---- .../ReadChangeStreamPartitionDoFnTest.java | 6 +- .../RestrictionInterrupterTest.java | 65 +++++++++ 11 files changed, 409 insertions(+), 44 deletions(-) create mode 100644 sdks/java/io/google-cloud-platform/src/main/java/org/apache/beam/sdk/io/gcp/spanner/changestreams/restriction/RestrictionInterrupter.java create mode 100644 sdks/java/io/google-cloud-platform/src/test/java/org/apache/beam/sdk/io/gcp/spanner/changestreams/restriction/RestrictionInterrupterTest.java diff --git a/sdks/java/io/google-cloud-platform/src/main/java/org/apache/beam/sdk/io/gcp/spanner/changestreams/action/ChildPartitionsRecordAction.java b/sdks/java/io/google-cloud-platform/src/main/java/org/apache/beam/sdk/io/gcp/spanner/changestreams/action/ChildPartitionsRecordAction.java index ada794d20c3b..14b6b2e2453a 100644 --- a/sdks/java/io/google-cloud-platform/src/main/java/org/apache/beam/sdk/io/gcp/spanner/changestreams/action/ChildPartitionsRecordAction.java +++ b/sdks/java/io/google-cloud-platform/src/main/java/org/apache/beam/sdk/io/gcp/spanner/changestreams/action/ChildPartitionsRecordAction.java @@ -26,6 +26,7 @@ import org.apache.beam.sdk.io.gcp.spanner.changestreams.model.ChildPartition; import org.apache.beam.sdk.io.gcp.spanner.changestreams.model.ChildPartitionsRecord; import org.apache.beam.sdk.io.gcp.spanner.changestreams.model.PartitionMetadata; +import org.apache.beam.sdk.io.gcp.spanner.changestreams.restriction.RestrictionInterrupter; import org.apache.beam.sdk.io.gcp.spanner.changestreams.restriction.TimestampRange; import org.apache.beam.sdk.transforms.DoFn.ProcessContinuation; import org.apache.beam.sdk.transforms.splittabledofn.ManualWatermarkEstimator; @@ -94,17 +95,21 @@ public class ChildPartitionsRecordAction { * @param record the change stream child partition record received * @param tracker the restriction tracker of the {@link * org.apache.beam.sdk.io.gcp.spanner.changestreams.dofn.ReadChangeStreamPartitionDoFn} SDF + * @param interrupter the restriction interrupter suggesting early termination of the processing * @param watermarkEstimator the watermark estimator of the {@link * org.apache.beam.sdk.io.gcp.spanner.changestreams.dofn.ReadChangeStreamPartitionDoFn} SDF * @return {@link Optional#empty()} if the caller can continue processing more records. A non * empty {@link Optional} with {@link ProcessContinuation#stop()} if this function was unable - * to claim the {@link ChildPartitionsRecord} timestamp + * to claim the {@link ChildPartitionsRecord} timestamp. A non empty {@link Optional} with + * {@link ProcessContinuation#resume()} if this function should commit what has already been + * processed and resume. 
*/ @VisibleForTesting public Optional run( PartitionMetadata partition, ChildPartitionsRecord record, RestrictionTracker tracker, + RestrictionInterrupter interrupter, ManualWatermarkEstimator watermarkEstimator) { final String token = partition.getPartitionToken(); @@ -113,6 +118,13 @@ public Optional run( final Timestamp startTimestamp = record.getStartTimestamp(); final Instant startInstant = new Instant(startTimestamp.toSqlTimestamp().getTime()); + if (interrupter.tryInterrupt(startTimestamp)) { + LOG.debug( + "[{}] Soft deadline reached with child partitions record at {}, rescheduling", + token, + startTimestamp); + return Optional.of(ProcessContinuation.resume()); + } if (!tracker.tryClaim(startTimestamp)) { LOG.debug("[{}] Could not claim queryChangeStream({}), stopping", token, startTimestamp); return Optional.of(ProcessContinuation.stop()); diff --git a/sdks/java/io/google-cloud-platform/src/main/java/org/apache/beam/sdk/io/gcp/spanner/changestreams/action/DataChangeRecordAction.java b/sdks/java/io/google-cloud-platform/src/main/java/org/apache/beam/sdk/io/gcp/spanner/changestreams/action/DataChangeRecordAction.java index 4ceda8afb3e6..555b1fefbebc 100644 --- a/sdks/java/io/google-cloud-platform/src/main/java/org/apache/beam/sdk/io/gcp/spanner/changestreams/action/DataChangeRecordAction.java +++ b/sdks/java/io/google-cloud-platform/src/main/java/org/apache/beam/sdk/io/gcp/spanner/changestreams/action/DataChangeRecordAction.java @@ -21,9 +21,9 @@ import java.util.Optional; import org.apache.beam.sdk.io.gcp.spanner.changestreams.dofn.ReadChangeStreamPartitionDoFn; import org.apache.beam.sdk.io.gcp.spanner.changestreams.estimator.ThroughputEstimator; -import org.apache.beam.sdk.io.gcp.spanner.changestreams.model.ChildPartitionsRecord; import org.apache.beam.sdk.io.gcp.spanner.changestreams.model.DataChangeRecord; import org.apache.beam.sdk.io.gcp.spanner.changestreams.model.PartitionMetadata; +import org.apache.beam.sdk.io.gcp.spanner.changestreams.restriction.RestrictionInterrupter; import org.apache.beam.sdk.io.gcp.spanner.changestreams.restriction.TimestampRange; import org.apache.beam.sdk.transforms.DoFn.OutputReceiver; import org.apache.beam.sdk.transforms.DoFn.ProcessContinuation; @@ -68,18 +68,22 @@ public DataChangeRecordAction(ThroughputEstimator throughputEs * @param partition the current partition being processed * @param record the change stream data record received * @param tracker the restriction tracker of the {@link ReadChangeStreamPartitionDoFn} SDF + * @param interrupter the restriction interrupter suggesting early termination of the processing * @param outputReceiver the output receiver of the {@link ReadChangeStreamPartitionDoFn} SDF * @param watermarkEstimator the watermark estimator of the {@link ReadChangeStreamPartitionDoFn} * SDF * @return {@link Optional#empty()} if the caller can continue processing more records. A non * empty {@link Optional} with {@link ProcessContinuation#stop()} if this function was unable - * to claim the {@link ChildPartitionsRecord} timestamp + * to claim the {@link DataChangeRecord} timestamp. A non empty {@link Optional} with {@link + * ProcessContinuation#resume()} if this function should commit what has already been + * processed and resume. 
*/ @VisibleForTesting public Optional run( PartitionMetadata partition, DataChangeRecord record, RestrictionTracker tracker, + RestrictionInterrupter interrupter, OutputReceiver outputReceiver, ManualWatermarkEstimator watermarkEstimator) { @@ -88,6 +92,13 @@ public Optional run( final Timestamp commitTimestamp = record.getCommitTimestamp(); final Instant commitInstant = new Instant(commitTimestamp.toSqlTimestamp().getTime()); + if (interrupter.tryInterrupt(commitTimestamp)) { + LOG.debug( + "[{}] Soft deadline reached with data change record at {}, rescheduling", + token, + commitTimestamp); + return Optional.of(ProcessContinuation.resume()); + } if (!tracker.tryClaim(commitTimestamp)) { LOG.debug("[{}] Could not claim queryChangeStream({}), stopping", token, commitTimestamp); return Optional.of(ProcessContinuation.stop()); diff --git a/sdks/java/io/google-cloud-platform/src/main/java/org/apache/beam/sdk/io/gcp/spanner/changestreams/action/HeartbeatRecordAction.java b/sdks/java/io/google-cloud-platform/src/main/java/org/apache/beam/sdk/io/gcp/spanner/changestreams/action/HeartbeatRecordAction.java index 83a232fe2093..0937e896fbf1 100644 --- a/sdks/java/io/google-cloud-platform/src/main/java/org/apache/beam/sdk/io/gcp/spanner/changestreams/action/HeartbeatRecordAction.java +++ b/sdks/java/io/google-cloud-platform/src/main/java/org/apache/beam/sdk/io/gcp/spanner/changestreams/action/HeartbeatRecordAction.java @@ -22,6 +22,7 @@ import org.apache.beam.sdk.io.gcp.spanner.changestreams.ChangeStreamMetrics; import org.apache.beam.sdk.io.gcp.spanner.changestreams.model.HeartbeatRecord; import org.apache.beam.sdk.io.gcp.spanner.changestreams.model.PartitionMetadata; +import org.apache.beam.sdk.io.gcp.spanner.changestreams.restriction.RestrictionInterrupter; import org.apache.beam.sdk.io.gcp.spanner.changestreams.restriction.TimestampRange; import org.apache.beam.sdk.transforms.DoFn.ProcessContinuation; import org.apache.beam.sdk.transforms.splittabledofn.ManualWatermarkEstimator; @@ -56,7 +57,9 @@ public class HeartbeatRecordAction { * not. If the {@link Optional} returned is empty, it means that the calling function can continue * with the processing. If an {@link Optional} of {@link ProcessContinuation#stop()} is returned, * it means that this function was unable to claim the timestamp of the {@link HeartbeatRecord}, - * so the caller should stop. + * so the caller should stop. If an {@link Optional} of {@link ProcessContinuation#resume()} is + * returned, it means that this function should not attempt to claim further timestamps of the + * {@link HeartbeatRecord}, but instead should commit what it has processed so far. * *

When processing the {@link HeartbeatRecord} the following procedure is applied: * @@ -72,6 +75,7 @@ public Optional run( PartitionMetadata partition, HeartbeatRecord record, RestrictionTracker tracker, + RestrictionInterrupter interrupter, ManualWatermarkEstimator watermarkEstimator) { final String token = partition.getPartitionToken(); @@ -79,6 +83,11 @@ public Optional run( final Timestamp timestamp = record.getTimestamp(); final Instant timestampInstant = new Instant(timestamp.toSqlTimestamp().getTime()); + if (interrupter.tryInterrupt(timestamp)) { + LOG.debug( + "[{}] Soft deadline reached with heartbeat record at {}, rescheduling", token, timestamp); + return Optional.of(ProcessContinuation.resume()); + } if (!tracker.tryClaim(timestamp)) { LOG.debug("[{}] Could not claim queryChangeStream({}), stopping", token, timestamp); return Optional.of(ProcessContinuation.stop()); diff --git a/sdks/java/io/google-cloud-platform/src/main/java/org/apache/beam/sdk/io/gcp/spanner/changestreams/action/QueryChangeStreamAction.java b/sdks/java/io/google-cloud-platform/src/main/java/org/apache/beam/sdk/io/gcp/spanner/changestreams/action/QueryChangeStreamAction.java index 92285946e56f..6edbd544a37c 100644 --- a/sdks/java/io/google-cloud-platform/src/main/java/org/apache/beam/sdk/io/gcp/spanner/changestreams/action/QueryChangeStreamAction.java +++ b/sdks/java/io/google-cloud-platform/src/main/java/org/apache/beam/sdk/io/gcp/spanner/changestreams/action/QueryChangeStreamAction.java @@ -33,6 +33,7 @@ import org.apache.beam.sdk.io.gcp.spanner.changestreams.model.DataChangeRecord; import org.apache.beam.sdk.io.gcp.spanner.changestreams.model.HeartbeatRecord; import org.apache.beam.sdk.io.gcp.spanner.changestreams.model.PartitionMetadata; +import org.apache.beam.sdk.io.gcp.spanner.changestreams.restriction.RestrictionInterrupter; import org.apache.beam.sdk.io.gcp.spanner.changestreams.restriction.TimestampRange; import org.apache.beam.sdk.transforms.DoFn.BundleFinalizer; import org.apache.beam.sdk.transforms.DoFn.OutputReceiver; @@ -62,6 +63,13 @@ public class QueryChangeStreamAction { private static final Logger LOG = LoggerFactory.getLogger(QueryChangeStreamAction.class); private static final Duration BUNDLE_FINALIZER_TIMEOUT = Duration.standardMinutes(5); + /* + * Corresponds to the best effort timeout in case the restriction tracker cannot split the processing + * interval before the hard deadline. When reached it will assure that the already processed timestamps + * will be committed instead of thrown away (DEADLINE_EXCEEDED). The value should be less than + * the RetrySetting RPC timeout setting of SpannerIO#ReadChangeStream. + */ + private static final Duration RESTRICTION_TRACKER_TIMEOUT = Duration.standardSeconds(40); private static final String OUT_OF_RANGE_ERROR_MESSAGE = "Specified start_timestamp is invalid"; private final ChangeStreamDao changeStreamDao; @@ -164,6 +172,10 @@ public ProcessContinuation run( new IllegalStateException( "Partition " + token + " not found in metadata table")); + // Interrupter with soft timeout to commit the work if any records have been processed. 
+ RestrictionInterrupter interrupter = + RestrictionInterrupter.withSoftTimeout(RESTRICTION_TRACKER_TIMEOUT); + try (ChangeStreamResultSet resultSet = changeStreamDao.changeStreamQuery( token, startTimestamp, endTimestamp, partition.getHeartbeatMillis())) { @@ -182,16 +194,25 @@ public ProcessContinuation run( updatedPartition, (DataChangeRecord) record, tracker, + interrupter, receiver, watermarkEstimator); } else if (record instanceof HeartbeatRecord) { maybeContinuation = heartbeatRecordAction.run( - updatedPartition, (HeartbeatRecord) record, tracker, watermarkEstimator); + updatedPartition, + (HeartbeatRecord) record, + tracker, + interrupter, + watermarkEstimator); } else if (record instanceof ChildPartitionsRecord) { maybeContinuation = childPartitionsRecordAction.run( - updatedPartition, (ChildPartitionsRecord) record, tracker, watermarkEstimator); + updatedPartition, + (ChildPartitionsRecord) record, + tracker, + interrupter, + watermarkEstimator); } else { LOG.error("[{}] Unknown record type {}", token, record.getClass()); throw new IllegalArgumentException("Unknown record type " + record.getClass()); diff --git a/sdks/java/io/google-cloud-platform/src/main/java/org/apache/beam/sdk/io/gcp/spanner/changestreams/restriction/RestrictionInterrupter.java b/sdks/java/io/google-cloud-platform/src/main/java/org/apache/beam/sdk/io/gcp/spanner/changestreams/restriction/RestrictionInterrupter.java new file mode 100644 index 000000000000..37e91911867a --- /dev/null +++ b/sdks/java/io/google-cloud-platform/src/main/java/org/apache/beam/sdk/io/gcp/spanner/changestreams/restriction/RestrictionInterrupter.java @@ -0,0 +1,85 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.beam.sdk.io.gcp.spanner.changestreams.restriction; + +import java.util.function.Supplier; +import org.apache.beam.vendor.guava.v32_1_2_jre.com.google.common.annotations.VisibleForTesting; +import org.checkerframework.checker.nullness.qual.NonNull; +import org.checkerframework.checker.nullness.qual.Nullable; +import org.joda.time.Duration; +import org.joda.time.Instant; + +/** An interrupter for restriction tracker of type T. */ +public class RestrictionInterrupter { + private @Nullable T lastAttemptedPosition; + + private Supplier timeSupplier; + private final Instant softDeadline; + private boolean hasInterrupted = true; + + /** + * Sets a soft timeout from now for processing new positions. After the timeout the tryInterrupt + * will start returning true indicating an early exit from processing. 
+ */ + public static RestrictionInterrupter withSoftTimeout(Duration timeout) { + return new RestrictionInterrupter(() -> Instant.now(), timeout); + } + + RestrictionInterrupter(Supplier timeSupplier, Duration timeout) { + this.timeSupplier = timeSupplier; + this.softDeadline = this.timeSupplier.get().plus(timeout); + hasInterrupted = false; + } + + @VisibleForTesting + void setTimeSupplier(Supplier timeSupplier) { + this.timeSupplier = timeSupplier; + } + + /** + * Returns true if the restriction tracker should be interrupted in claiming new positions. + * + *

+ *   1. If soft deadline hasn't been reached always returns false.
+ *   2. If soft deadline has been reached but we haven't processed any positions returns false.
+ *   3. If soft deadline has been reached but the new position is the same as the last attempted
+ *      position returns false.
+ *   4. If soft deadline has been reached and the new position differs from the last attempted
+ *      position returns true.
+ * + * @return {@code true} if the position processing should continue, {@code false} if the soft + * deadline has been reached and we have fully processed the previous position. + */ + public boolean tryInterrupt(@NonNull T position) { + if (hasInterrupted) { + return true; + } + if (lastAttemptedPosition == null) { + lastAttemptedPosition = position; + return false; + } + if (position.equals(lastAttemptedPosition)) { + return false; + } + lastAttemptedPosition = position; + + hasInterrupted |= timeSupplier.get().isAfter(softDeadline); + return hasInterrupted; + } +} diff --git a/sdks/java/io/google-cloud-platform/src/test/java/org/apache/beam/sdk/io/gcp/spanner/changestreams/action/ChildPartitionsRecordActionTest.java b/sdks/java/io/google-cloud-platform/src/test/java/org/apache/beam/sdk/io/gcp/spanner/changestreams/action/ChildPartitionsRecordActionTest.java index 03d390ea0d5d..5815bf0c6fdd 100644 --- a/sdks/java/io/google-cloud-platform/src/test/java/org/apache/beam/sdk/io/gcp/spanner/changestreams/action/ChildPartitionsRecordActionTest.java +++ b/sdks/java/io/google-cloud-platform/src/test/java/org/apache/beam/sdk/io/gcp/spanner/changestreams/action/ChildPartitionsRecordActionTest.java @@ -38,6 +38,7 @@ import org.apache.beam.sdk.io.gcp.spanner.changestreams.model.ChildPartition; import org.apache.beam.sdk.io.gcp.spanner.changestreams.model.ChildPartitionsRecord; import org.apache.beam.sdk.io.gcp.spanner.changestreams.model.PartitionMetadata; +import org.apache.beam.sdk.io.gcp.spanner.changestreams.restriction.RestrictionInterrupter; import org.apache.beam.sdk.io.gcp.spanner.changestreams.restriction.TimestampRange; import org.apache.beam.sdk.io.gcp.spanner.changestreams.util.TestTransactionAnswer; import org.apache.beam.sdk.transforms.DoFn.ProcessContinuation; @@ -55,6 +56,7 @@ public class ChildPartitionsRecordActionTest { private ChangeStreamMetrics metrics; private ChildPartitionsRecordAction action; private RestrictionTracker tracker; + private RestrictionInterrupter interrupter; private ManualWatermarkEstimator watermarkEstimator; @Before @@ -64,6 +66,7 @@ public void setUp() { metrics = mock(ChangeStreamMetrics.class); action = new ChildPartitionsRecordAction(dao, metrics); tracker = mock(RestrictionTracker.class); + interrupter = mock(RestrictionInterrupter.class); watermarkEstimator = mock(ManualWatermarkEstimator.class); when(dao.runInTransaction(any(), anyObject())) @@ -93,7 +96,7 @@ public void testRestrictionClaimedAndIsSplitCase() { when(transaction.getPartition("childPartition2")).thenReturn(null); final Optional maybeContinuation = - action.run(partition, record, tracker, watermarkEstimator); + action.run(partition, record, tracker, interrupter, watermarkEstimator); assertEquals(Optional.empty(), maybeContinuation); verify(watermarkEstimator).setWatermark(new Instant(startTimestamp.toSqlTimestamp().getTime())); @@ -144,7 +147,7 @@ public void testRestrictionClaimedAnsIsSplitCaseAndChildExists() { when(transaction.getPartition("childPartition2")).thenReturn(mock(Struct.class)); final Optional maybeContinuation = - action.run(partition, record, tracker, watermarkEstimator); + action.run(partition, record, tracker, interrupter, watermarkEstimator); assertEquals(Optional.empty(), maybeContinuation); verify(watermarkEstimator).setWatermark(new Instant(startTimestamp.toSqlTimestamp().getTime())); @@ -173,7 +176,7 @@ public void testRestrictionClaimedAndIsMergeCaseAndChildNotExists() { when(transaction.getPartition(childPartitionToken)).thenReturn(null); final Optional 
maybeContinuation = - action.run(partition, record, tracker, watermarkEstimator); + action.run(partition, record, tracker, interrupter, watermarkEstimator); assertEquals(Optional.empty(), maybeContinuation); verify(watermarkEstimator).setWatermark(new Instant(startTimestamp.toSqlTimestamp().getTime())); @@ -213,7 +216,7 @@ public void testRestrictionClaimedAndIsMergeCaseAndChildExists() { when(transaction.getPartition(childPartitionToken)).thenReturn(mock(Struct.class)); final Optional maybeContinuation = - action.run(partition, record, tracker, watermarkEstimator); + action.run(partition, record, tracker, interrupter, watermarkEstimator); assertEquals(Optional.empty(), maybeContinuation); verify(watermarkEstimator).setWatermark(new Instant(startTimestamp.toSqlTimestamp().getTime())); @@ -237,10 +240,35 @@ public void testRestrictionNotClaimed() { when(tracker.tryClaim(startTimestamp)).thenReturn(false); final Optional maybeContinuation = - action.run(partition, record, tracker, watermarkEstimator); + action.run(partition, record, tracker, interrupter, watermarkEstimator); assertEquals(Optional.of(ProcessContinuation.stop()), maybeContinuation); verify(watermarkEstimator, never()).setWatermark(any()); verify(dao, never()).insert(any()); } + + @Test + public void testSoftDeadlineReached() { + final String partitionToken = "partitionToken"; + final Timestamp startTimestamp = Timestamp.ofTimeMicroseconds(10L); + final PartitionMetadata partition = mock(PartitionMetadata.class); + final ChildPartitionsRecord record = + new ChildPartitionsRecord( + startTimestamp, + "recordSequence", + Arrays.asList( + new ChildPartition("childPartition1", partitionToken), + new ChildPartition("childPartition2", partitionToken)), + null); + when(partition.getPartitionToken()).thenReturn(partitionToken); + when(interrupter.tryInterrupt(startTimestamp)).thenReturn(true); + when(tracker.tryClaim(startTimestamp)).thenReturn(true); + + final Optional maybeContinuation = + action.run(partition, record, tracker, interrupter, watermarkEstimator); + + assertEquals(Optional.of(ProcessContinuation.resume()), maybeContinuation); + verify(watermarkEstimator, never()).setWatermark(any()); + verify(dao, never()).insert(any()); + } } diff --git a/sdks/java/io/google-cloud-platform/src/test/java/org/apache/beam/sdk/io/gcp/spanner/changestreams/action/DataChangeRecordActionTest.java b/sdks/java/io/google-cloud-platform/src/test/java/org/apache/beam/sdk/io/gcp/spanner/changestreams/action/DataChangeRecordActionTest.java index ac8d48725299..6569f810812c 100644 --- a/sdks/java/io/google-cloud-platform/src/test/java/org/apache/beam/sdk/io/gcp/spanner/changestreams/action/DataChangeRecordActionTest.java +++ b/sdks/java/io/google-cloud-platform/src/test/java/org/apache/beam/sdk/io/gcp/spanner/changestreams/action/DataChangeRecordActionTest.java @@ -30,6 +30,7 @@ import org.apache.beam.sdk.io.gcp.spanner.changestreams.estimator.BytesThroughputEstimator; import org.apache.beam.sdk.io.gcp.spanner.changestreams.model.DataChangeRecord; import org.apache.beam.sdk.io.gcp.spanner.changestreams.model.PartitionMetadata; +import org.apache.beam.sdk.io.gcp.spanner.changestreams.restriction.RestrictionInterrupter; import org.apache.beam.sdk.io.gcp.spanner.changestreams.restriction.TimestampRange; import org.apache.beam.sdk.transforms.DoFn.OutputReceiver; import org.apache.beam.sdk.transforms.DoFn.ProcessContinuation; @@ -44,6 +45,7 @@ public class DataChangeRecordActionTest { private DataChangeRecordAction action; private PartitionMetadata 
partition; private RestrictionTracker tracker; + private RestrictionInterrupter interrupter; private OutputReceiver outputReceiver; private ManualWatermarkEstimator watermarkEstimator; private BytesThroughputEstimator throughputEstimator; @@ -54,6 +56,7 @@ public void setUp() { action = new DataChangeRecordAction(throughputEstimator); partition = mock(PartitionMetadata.class); tracker = mock(RestrictionTracker.class); + interrupter = mock(RestrictionInterrupter.class); outputReceiver = mock(OutputReceiver.class); watermarkEstimator = mock(ManualWatermarkEstimator.class); } @@ -69,7 +72,7 @@ public void testRestrictionClaimed() { when(partition.getPartitionToken()).thenReturn(partitionToken); final Optional maybeContinuation = - action.run(partition, record, tracker, outputReceiver, watermarkEstimator); + action.run(partition, record, tracker, interrupter, outputReceiver, watermarkEstimator); assertEquals(Optional.empty(), maybeContinuation); verify(outputReceiver).outputWithTimestamp(record, instant); @@ -87,11 +90,30 @@ public void testRestrictionNotClaimed() { when(partition.getPartitionToken()).thenReturn(partitionToken); final Optional maybeContinuation = - action.run(partition, record, tracker, outputReceiver, watermarkEstimator); + action.run(partition, record, tracker, interrupter, outputReceiver, watermarkEstimator); assertEquals(Optional.of(ProcessContinuation.stop()), maybeContinuation); verify(outputReceiver, never()).outputWithTimestamp(any(), any()); verify(watermarkEstimator, never()).setWatermark(any()); verify(throughputEstimator, never()).update(any(), any()); } + + @Test + public void testSoftDeadlineReached() { + final String partitionToken = "partitionToken"; + final Timestamp timestamp = Timestamp.ofTimeMicroseconds(10L); + final DataChangeRecord record = mock(DataChangeRecord.class); + when(record.getCommitTimestamp()).thenReturn(timestamp); + when(interrupter.tryInterrupt(timestamp)).thenReturn(true); + when(tracker.tryClaim(timestamp)).thenReturn(true); + when(partition.getPartitionToken()).thenReturn(partitionToken); + + final Optional maybeContinuation = + action.run(partition, record, tracker, interrupter, outputReceiver, watermarkEstimator); + + assertEquals(Optional.of(ProcessContinuation.resume()), maybeContinuation); + verify(outputReceiver, never()).outputWithTimestamp(any(), any()); + verify(watermarkEstimator, never()).setWatermark(any()); + verify(throughputEstimator, never()).update(any(), any()); + } } diff --git a/sdks/java/io/google-cloud-platform/src/test/java/org/apache/beam/sdk/io/gcp/spanner/changestreams/action/HeartbeatRecordActionTest.java b/sdks/java/io/google-cloud-platform/src/test/java/org/apache/beam/sdk/io/gcp/spanner/changestreams/action/HeartbeatRecordActionTest.java index 77333bbbc96e..56d1825c8a18 100644 --- a/sdks/java/io/google-cloud-platform/src/test/java/org/apache/beam/sdk/io/gcp/spanner/changestreams/action/HeartbeatRecordActionTest.java +++ b/sdks/java/io/google-cloud-platform/src/test/java/org/apache/beam/sdk/io/gcp/spanner/changestreams/action/HeartbeatRecordActionTest.java @@ -29,6 +29,7 @@ import org.apache.beam.sdk.io.gcp.spanner.changestreams.ChangeStreamMetrics; import org.apache.beam.sdk.io.gcp.spanner.changestreams.model.HeartbeatRecord; import org.apache.beam.sdk.io.gcp.spanner.changestreams.model.PartitionMetadata; +import org.apache.beam.sdk.io.gcp.spanner.changestreams.restriction.RestrictionInterrupter; import org.apache.beam.sdk.io.gcp.spanner.changestreams.restriction.TimestampRange; import 
org.apache.beam.sdk.transforms.DoFn.ProcessContinuation; import org.apache.beam.sdk.transforms.splittabledofn.ManualWatermarkEstimator; @@ -42,6 +43,7 @@ public class HeartbeatRecordActionTest { private HeartbeatRecordAction action; private PartitionMetadata partition; private RestrictionTracker tracker; + private RestrictionInterrupter interrupter; private ManualWatermarkEstimator watermarkEstimator; @Before @@ -50,6 +52,7 @@ public void setUp() { action = new HeartbeatRecordAction(metrics); partition = mock(PartitionMetadata.class); tracker = mock(RestrictionTracker.class); + interrupter = mock(RestrictionInterrupter.class); watermarkEstimator = mock(ManualWatermarkEstimator.class); } @@ -62,7 +65,12 @@ public void testRestrictionClaimed() { when(partition.getPartitionToken()).thenReturn(partitionToken); final Optional maybeContinuation = - action.run(partition, new HeartbeatRecord(timestamp, null), tracker, watermarkEstimator); + action.run( + partition, + new HeartbeatRecord(timestamp, null), + tracker, + interrupter, + watermarkEstimator); assertEquals(Optional.empty(), maybeContinuation); verify(watermarkEstimator).setWatermark(new Instant(timestamp.toSqlTimestamp().getTime())); @@ -77,9 +85,35 @@ public void testRestrictionNotClaimed() { when(partition.getPartitionToken()).thenReturn(partitionToken); final Optional maybeContinuation = - action.run(partition, new HeartbeatRecord(timestamp, null), tracker, watermarkEstimator); + action.run( + partition, + new HeartbeatRecord(timestamp, null), + tracker, + interrupter, + watermarkEstimator); assertEquals(Optional.of(ProcessContinuation.stop()), maybeContinuation); verify(watermarkEstimator, never()).setWatermark(any()); } + + @Test + public void testSoftDeadlineReached() { + final String partitionToken = "partitionToken"; + final Timestamp timestamp = Timestamp.ofTimeMicroseconds(10L); + + when(interrupter.tryInterrupt(timestamp)).thenReturn(true); + when(tracker.tryClaim(timestamp)).thenReturn(true); + when(partition.getPartitionToken()).thenReturn(partitionToken); + + final Optional maybeContinuation = + action.run( + partition, + new HeartbeatRecord(timestamp, null), + tracker, + interrupter, + watermarkEstimator); + + assertEquals(Optional.of(ProcessContinuation.resume()), maybeContinuation); + verify(watermarkEstimator, never()).setWatermark(any()); + } } diff --git a/sdks/java/io/google-cloud-platform/src/test/java/org/apache/beam/sdk/io/gcp/spanner/changestreams/action/QueryChangeStreamActionTest.java b/sdks/java/io/google-cloud-platform/src/test/java/org/apache/beam/sdk/io/gcp/spanner/changestreams/action/QueryChangeStreamActionTest.java index bf7b0adfd475..c73a62a812bd 100644 --- a/sdks/java/io/google-cloud-platform/src/test/java/org/apache/beam/sdk/io/gcp/spanner/changestreams/action/QueryChangeStreamActionTest.java +++ b/sdks/java/io/google-cloud-platform/src/test/java/org/apache/beam/sdk/io/gcp/spanner/changestreams/action/QueryChangeStreamActionTest.java @@ -20,6 +20,7 @@ import static org.apache.beam.sdk.io.gcp.spanner.changestreams.model.PartitionMetadata.State.SCHEDULED; import static org.junit.Assert.assertEquals; import static org.mockito.ArgumentMatchers.any; +import static org.mockito.ArgumentMatchers.eq; import static org.mockito.Mockito.mock; import static org.mockito.Mockito.never; import static org.mockito.Mockito.verify; @@ -40,6 +41,7 @@ import org.apache.beam.sdk.io.gcp.spanner.changestreams.model.DataChangeRecord; import org.apache.beam.sdk.io.gcp.spanner.changestreams.model.HeartbeatRecord; import 
org.apache.beam.sdk.io.gcp.spanner.changestreams.model.PartitionMetadata; +import org.apache.beam.sdk.io.gcp.spanner.changestreams.restriction.RestrictionInterrupter; import org.apache.beam.sdk.io.gcp.spanner.changestreams.restriction.TimestampRange; import org.apache.beam.sdk.transforms.DoFn.BundleFinalizer; import org.apache.beam.sdk.transforms.DoFn.OutputReceiver; @@ -144,10 +146,20 @@ public void testQueryChangeStreamWithDataChangeRecord() { when(changeStreamRecordMapper.toChangeStreamRecords(partition, resultSet, resultSetMetadata)) .thenReturn(Arrays.asList(record1, record2)); when(dataChangeRecordAction.run( - partition, record1, restrictionTracker, outputReceiver, watermarkEstimator)) + eq(partition), + eq(record1), + eq(restrictionTracker), + any(RestrictionInterrupter.class), + eq(outputReceiver), + eq(watermarkEstimator))) .thenReturn(Optional.empty()); when(dataChangeRecordAction.run( - partition, record2, restrictionTracker, outputReceiver, watermarkEstimator)) + eq(partition), + eq(record2), + eq(restrictionTracker), + any(RestrictionInterrupter.class), + eq(outputReceiver), + eq(watermarkEstimator))) .thenReturn(Optional.of(ProcessContinuation.stop())); when(watermarkEstimator.currentWatermark()).thenReturn(WATERMARK); @@ -157,13 +169,25 @@ public void testQueryChangeStreamWithDataChangeRecord() { assertEquals(ProcessContinuation.stop(), result); verify(dataChangeRecordAction) - .run(partition, record1, restrictionTracker, outputReceiver, watermarkEstimator); + .run( + eq(partition), + eq(record1), + eq(restrictionTracker), + any(RestrictionInterrupter.class), + eq(outputReceiver), + eq(watermarkEstimator)); verify(dataChangeRecordAction) - .run(partition, record2, restrictionTracker, outputReceiver, watermarkEstimator); + .run( + eq(partition), + eq(record2), + eq(restrictionTracker), + any(RestrictionInterrupter.class), + eq(outputReceiver), + eq(watermarkEstimator)); verify(partitionMetadataDao).updateWatermark(PARTITION_TOKEN, WATERMARK_TIMESTAMP); - verify(heartbeatRecordAction, never()).run(any(), any(), any(), any()); - verify(childPartitionsRecordAction, never()).run(any(), any(), any(), any()); + verify(heartbeatRecordAction, never()).run(any(), any(), any(), any(), any()); + verify(childPartitionsRecordAction, never()).run(any(), any(), any(), any(), any()); verify(restrictionTracker, never()).tryClaim(any()); } @@ -188,9 +212,19 @@ public void testQueryChangeStreamWithHeartbeatRecord() { when(resultSet.getMetadata()).thenReturn(resultSetMetadata); when(changeStreamRecordMapper.toChangeStreamRecords(partition, resultSet, resultSetMetadata)) .thenReturn(Arrays.asList(record1, record2)); - when(heartbeatRecordAction.run(partition, record1, restrictionTracker, watermarkEstimator)) + when(heartbeatRecordAction.run( + eq(partition), + eq(record1), + eq(restrictionTracker), + any(RestrictionInterrupter.class), + eq(watermarkEstimator))) .thenReturn(Optional.empty()); - when(heartbeatRecordAction.run(partition, record2, restrictionTracker, watermarkEstimator)) + when(heartbeatRecordAction.run( + eq(partition), + eq(record2), + eq(restrictionTracker), + any(RestrictionInterrupter.class), + eq(watermarkEstimator))) .thenReturn(Optional.of(ProcessContinuation.stop())); when(watermarkEstimator.currentWatermark()).thenReturn(WATERMARK); @@ -199,12 +233,24 @@ public void testQueryChangeStreamWithHeartbeatRecord() { partition, restrictionTracker, outputReceiver, watermarkEstimator, bundleFinalizer); assertEquals(ProcessContinuation.stop(), result); - 
verify(heartbeatRecordAction).run(partition, record1, restrictionTracker, watermarkEstimator); - verify(heartbeatRecordAction).run(partition, record2, restrictionTracker, watermarkEstimator); + verify(heartbeatRecordAction) + .run( + eq(partition), + eq(record1), + eq(restrictionTracker), + any(RestrictionInterrupter.class), + eq(watermarkEstimator)); + verify(heartbeatRecordAction) + .run( + eq(partition), + eq(record2), + eq(restrictionTracker), + any(RestrictionInterrupter.class), + eq(watermarkEstimator)); verify(partitionMetadataDao).updateWatermark(PARTITION_TOKEN, WATERMARK_TIMESTAMP); - verify(dataChangeRecordAction, never()).run(any(), any(), any(), any(), any()); - verify(childPartitionsRecordAction, never()).run(any(), any(), any(), any()); + verify(dataChangeRecordAction, never()).run(any(), any(), any(), any(), any(), any()); + verify(childPartitionsRecordAction, never()).run(any(), any(), any(), any(), any()); verify(restrictionTracker, never()).tryClaim(any()); } @@ -230,10 +276,18 @@ public void testQueryChangeStreamWithChildPartitionsRecord() { when(changeStreamRecordMapper.toChangeStreamRecords(partition, resultSet, resultSetMetadata)) .thenReturn(Arrays.asList(record1, record2)); when(childPartitionsRecordAction.run( - partition, record1, restrictionTracker, watermarkEstimator)) + eq(partition), + eq(record1), + eq(restrictionTracker), + any(RestrictionInterrupter.class), + eq(watermarkEstimator))) .thenReturn(Optional.empty()); when(childPartitionsRecordAction.run( - partition, record2, restrictionTracker, watermarkEstimator)) + eq(partition), + eq(record2), + eq(restrictionTracker), + any(RestrictionInterrupter.class), + eq(watermarkEstimator))) .thenReturn(Optional.of(ProcessContinuation.stop())); when(watermarkEstimator.currentWatermark()).thenReturn(WATERMARK); @@ -243,13 +297,23 @@ public void testQueryChangeStreamWithChildPartitionsRecord() { assertEquals(ProcessContinuation.stop(), result); verify(childPartitionsRecordAction) - .run(partition, record1, restrictionTracker, watermarkEstimator); + .run( + eq(partition), + eq(record1), + eq(restrictionTracker), + any(RestrictionInterrupter.class), + eq(watermarkEstimator)); verify(childPartitionsRecordAction) - .run(partition, record2, restrictionTracker, watermarkEstimator); + .run( + eq(partition), + eq(record2), + eq(restrictionTracker), + any(RestrictionInterrupter.class), + eq(watermarkEstimator)); verify(partitionMetadataDao).updateWatermark(PARTITION_TOKEN, WATERMARK_TIMESTAMP); - verify(dataChangeRecordAction, never()).run(any(), any(), any(), any(), any()); - verify(heartbeatRecordAction, never()).run(any(), any(), any(), any()); + verify(dataChangeRecordAction, never()).run(any(), any(), any(), any(), any(), any()); + verify(heartbeatRecordAction, never()).run(any(), any(), any(), any(), any()); verify(restrictionTracker, never()).tryClaim(any()); } @@ -279,7 +343,11 @@ public void testQueryChangeStreamWithRestrictionFromAfterPartitionStart() { when(changeStreamRecordMapper.toChangeStreamRecords(partition, resultSet, resultSetMetadata)) .thenReturn(Arrays.asList(record1, record2)); when(childPartitionsRecordAction.run( - partition, record2, restrictionTracker, watermarkEstimator)) + eq(partition), + eq(record2), + eq(restrictionTracker), + any(RestrictionInterrupter.class), + eq(watermarkEstimator))) .thenReturn(Optional.of(ProcessContinuation.stop())); when(watermarkEstimator.currentWatermark()).thenReturn(WATERMARK); @@ -289,13 +357,23 @@ public void 
testQueryChangeStreamWithRestrictionFromAfterPartitionStart() { assertEquals(ProcessContinuation.stop(), result); verify(childPartitionsRecordAction) - .run(partition, record1, restrictionTracker, watermarkEstimator); + .run( + eq(partition), + eq(record1), + eq(restrictionTracker), + any(RestrictionInterrupter.class), + eq(watermarkEstimator)); verify(childPartitionsRecordAction) - .run(partition, record2, restrictionTracker, watermarkEstimator); + .run( + eq(partition), + eq(record2), + eq(restrictionTracker), + any(RestrictionInterrupter.class), + eq(watermarkEstimator)); verify(partitionMetadataDao).updateWatermark(PARTITION_TOKEN, WATERMARK_TIMESTAMP); - verify(dataChangeRecordAction, never()).run(any(), any(), any(), any(), any()); - verify(heartbeatRecordAction, never()).run(any(), any(), any(), any()); + verify(dataChangeRecordAction, never()).run(any(), any(), any(), any(), any(), any()); + verify(heartbeatRecordAction, never()).run(any(), any(), any(), any(), any()); verify(restrictionTracker, never()).tryClaim(any()); } @@ -320,9 +398,9 @@ public void testQueryChangeStreamWithStreamFinished() { verify(partitionMetadataDao).updateWatermark(PARTITION_TOKEN, WATERMARK_TIMESTAMP); verify(partitionMetadataDao).updateToFinished(PARTITION_TOKEN); - verify(dataChangeRecordAction, never()).run(any(), any(), any(), any(), any()); - verify(heartbeatRecordAction, never()).run(any(), any(), any(), any()); - verify(childPartitionsRecordAction, never()).run(any(), any(), any(), any()); + verify(dataChangeRecordAction, never()).run(any(), any(), any(), any(), any(), any()); + verify(heartbeatRecordAction, never()).run(any(), any(), any(), any(), any()); + verify(childPartitionsRecordAction, never()).run(any(), any(), any(), any(), any()); } private static class BundleFinalizerStub implements BundleFinalizer { diff --git a/sdks/java/io/google-cloud-platform/src/test/java/org/apache/beam/sdk/io/gcp/spanner/changestreams/dofn/ReadChangeStreamPartitionDoFnTest.java b/sdks/java/io/google-cloud-platform/src/test/java/org/apache/beam/sdk/io/gcp/spanner/changestreams/dofn/ReadChangeStreamPartitionDoFnTest.java index 538bdf768664..87588eb8d0a9 100644 --- a/sdks/java/io/google-cloud-platform/src/test/java/org/apache/beam/sdk/io/gcp/spanner/changestreams/dofn/ReadChangeStreamPartitionDoFnTest.java +++ b/sdks/java/io/google-cloud-platform/src/test/java/org/apache/beam/sdk/io/gcp/spanner/changestreams/dofn/ReadChangeStreamPartitionDoFnTest.java @@ -149,9 +149,9 @@ public void testQueryChangeStreamMode() { verify(queryChangeStreamAction) .run(partition, tracker, receiver, watermarkEstimator, bundleFinalizer); - verify(dataChangeRecordAction, never()).run(any(), any(), any(), any(), any()); - verify(heartbeatRecordAction, never()).run(any(), any(), any(), any()); - verify(childPartitionsRecordAction, never()).run(any(), any(), any(), any()); + verify(dataChangeRecordAction, never()).run(any(), any(), any(), any(), any(), any()); + verify(heartbeatRecordAction, never()).run(any(), any(), any(), any(), any()); + verify(childPartitionsRecordAction, never()).run(any(), any(), any(), any(), any()); verify(tracker, never()).tryClaim(any()); } diff --git a/sdks/java/io/google-cloud-platform/src/test/java/org/apache/beam/sdk/io/gcp/spanner/changestreams/restriction/RestrictionInterrupterTest.java b/sdks/java/io/google-cloud-platform/src/test/java/org/apache/beam/sdk/io/gcp/spanner/changestreams/restriction/RestrictionInterrupterTest.java new file mode 100644 index 000000000000..6d376ec528ba --- /dev/null +++ 
b/sdks/java/io/google-cloud-platform/src/test/java/org/apache/beam/sdk/io/gcp/spanner/changestreams/restriction/RestrictionInterrupterTest.java @@ -0,0 +1,65 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.beam.sdk.io.gcp.spanner.changestreams.restriction; + +import static org.junit.Assert.assertFalse; +import static org.junit.Assert.assertTrue; + +import org.joda.time.Duration; +import org.joda.time.Instant; +import org.junit.Test; + +public class RestrictionInterrupterTest { + + @Test + public void testTryInterrupt() { + RestrictionInterrupter interrupter = + new RestrictionInterrupter( + () -> Instant.ofEpochSecond(0), Duration.standardSeconds(30)); + interrupter.setTimeSupplier(() -> Instant.ofEpochSecond(10)); + assertFalse(interrupter.tryInterrupt(1)); + interrupter.setTimeSupplier(() -> Instant.ofEpochSecond(15)); + assertFalse(interrupter.tryInterrupt(2)); + interrupter.setTimeSupplier(() -> Instant.ofEpochSecond(30)); + assertFalse(interrupter.tryInterrupt(3)); + interrupter.setTimeSupplier(() -> Instant.ofEpochSecond(40)); + // Though the deadline has passed same position as previously accepted is not interrupted. + assertFalse(interrupter.tryInterrupt(3)); + assertTrue(interrupter.tryInterrupt(4)); + assertTrue(interrupter.tryInterrupt(5)); + interrupter.setTimeSupplier(() -> Instant.ofEpochSecond(50)); + assertTrue(interrupter.tryInterrupt(5)); + // Even with non-monotonic clock the interrupter will now always interrupt. + interrupter.setTimeSupplier(() -> Instant.ofEpochSecond(40)); + assertTrue(interrupter.tryInterrupt(5)); + } + + @Test + public void testTryInterruptNoPreviousPosition() { + RestrictionInterrupter interrupter = + new RestrictionInterrupter( + () -> Instant.ofEpochSecond(0), Duration.standardSeconds(30)); + interrupter.setTimeSupplier(() -> Instant.ofEpochSecond(40)); + assertFalse(interrupter.tryInterrupt(1)); + // Though the deadline has passed same position as previously accepted is not interrupted. + assertFalse(interrupter.tryInterrupt(1)); + assertTrue(interrupter.tryInterrupt(2)); + interrupter.setTimeSupplier(() -> Instant.ofEpochSecond(50)); + assertTrue(interrupter.tryInterrupt(3)); + } +} From 1d5a3e8ee6e463ae268116d9baf854369446b71d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Eduardo=20Ram=C3=ADrez?= Date: Tue, 7 Jan 2025 15:18:20 +0100 Subject: [PATCH 25/43] Enable caching in Python tests workflow (#33355) * Enable caching in Python tests workflow As can be seen in [BuildBudget's demo](https://buildbudget.dev/demo/workflow/2083803/), this workflow costs ~$2k/month. This change should reduce the time it takes and eventually its cost by using standard caching techniques. * fixup! Enable caching in Python tests workflow * removed unnecessary input * fixup! 
removed unnecessary input --- .../setup-environment-action/action.yml | 28 ++++++++++++++++++- .github/workflows/python_tests.yml | 4 +-- 2 files changed, 29 insertions(+), 3 deletions(-) diff --git a/.github/actions/setup-environment-action/action.yml b/.github/actions/setup-environment-action/action.yml index 72246566290d..9c8c0af416cd 100644 --- a/.github/actions/setup-environment-action/action.yml +++ b/.github/actions/setup-environment-action/action.yml @@ -34,15 +34,41 @@ inputs: required: false description: 'Whether to disable the gradle cache' default: false + python-cache: + required: false + description: 'Whether to enable Python pip caching' + default: true + tox-cache: + required: false + description: 'Whether to enable tox environment caching' + default: true runs: using: "composite" steps: - name: Install Python if: ${{ inputs.python-version != '' }} - uses: actions/setup-python@v4 + uses: actions/setup-python@v5 with: python-version: ${{ inputs.python-version == 'default' && '3.9' || inputs.python-version }} + cache: ${{ inputs.python-cache && 'pip' || 'none' }} + cache-dependency-path: | + sdks/python/setup.py + sdks/python/tox.ini + + - name: Cache tox environments + if: ${{ inputs.python-version != '' && inputs.tox-cache == 'true' }} + uses: actions/cache@v3 + with: + path: | + sdks/python/target/.tox + !sdks/python/target/.tox/**/log + !sdks/python/target/.tox/.package_cache + key: tox-${{ runner.os }}-py${{ inputs.python-version == 'default' && '39' || inputs.python-version }}-${{ hashFiles('sdks/python/tox.ini') }}-${{ hashFiles('sdks/python/setup.py') }} + restore-keys: | + tox-${{ runner.os }}-py${{ inputs.python-version == 'default' && '39' || inputs.python-version }}-${{ hashFiles('sdks/python/tox.ini') }}- + tox-${{ runner.os }}-py${{ inputs.python-version == 'default' && '39' || inputs.python-version }}- + - name: Install Java if: ${{ inputs.java-version != '' }} uses: actions/setup-java@v3 diff --git a/.github/workflows/python_tests.yml b/.github/workflows/python_tests.yml index 2c3b39a33c1d..fc6d4566ea5d 100644 --- a/.github/workflows/python_tests.yml +++ b/.github/workflows/python_tests.yml @@ -93,8 +93,8 @@ jobs: matrix: os: [macos-latest, windows-latest] params: [ - {"py_ver": "3.9", "tox_env": "py39"}, - {"py_ver": "3.10", "tox_env": "py310" }, + { "py_ver": "3.9", "tox_env": "py39" }, + { "py_ver": "3.10", "tox_env": "py310" }, { "py_ver": "3.11", "tox_env": "py311" }, { "py_ver": "3.12", "tox_env": "py312" }, ] From f17bb8d5cf4da276f14bfdaaaba23eeeaa1bf77c Mon Sep 17 00:00:00 2001 From: Bartosz Zablocki Date: Tue, 7 Jan 2025 16:11:19 +0100 Subject: [PATCH 26/43] SolaceIO: separate auth and session settings (#32406) * Refactored to separate authentication and session settings, and allow inheritance and overriding of SessionService * Improve methods' javadoc --- .../apache/beam/sdk/io/solace/SolaceIO.java | 7 +- .../BasicAuthJcsmpSessionServiceFactory.java | 26 +-- ...nService.java => JcsmpSessionService.java} | 79 ++----- .../sdk/io/solace/broker/SessionService.java | 42 ++-- .../solace/broker/SessionServiceFactory.java | 39 +++- .../io/solace/MockEmptySessionService.java | 2 +- .../sdk/io/solace/MockSessionService.java | 4 +- ...lsBasicAuthJcsmpSessionServiceFactory.java | 62 ++++++ ...SolaceIOCustomSessionServiceFactoryIT.java | 193 ++++++++++++++++++ .../beam/sdk/io/solace/it/SolaceIOIT.java | 1 + 10 files changed, 345 insertions(+), 110 deletions(-) rename 
sdks/java/io/solace/src/main/java/org/apache/beam/sdk/io/solace/broker/{BasicAuthJcsmpSessionService.java => JcsmpSessionService.java} (74%) create mode 100644 sdks/java/io/solace/src/test/java/org/apache/beam/sdk/io/solace/it/FixedCredentialsBasicAuthJcsmpSessionServiceFactory.java create mode 100644 sdks/java/io/solace/src/test/java/org/apache/beam/sdk/io/solace/it/SolaceIOCustomSessionServiceFactoryIT.java diff --git a/sdks/java/io/solace/src/main/java/org/apache/beam/sdk/io/solace/SolaceIO.java b/sdks/java/io/solace/src/main/java/org/apache/beam/sdk/io/solace/SolaceIO.java index a55d8a0a4217..63509126022b 100644 --- a/sdks/java/io/solace/src/main/java/org/apache/beam/sdk/io/solace/SolaceIO.java +++ b/sdks/java/io/solace/src/main/java/org/apache/beam/sdk/io/solace/SolaceIO.java @@ -209,7 +209,7 @@ * *

 * <p>See {@link Read#withSessionServiceFactory(SessionServiceFactory)} for session authentication.
 * The connector provides implementation of the {@link SessionServiceFactory} using the Basic
- * Authentication: {@link org.apache.beam.sdk.io.solace.broker.BasicAuthJcsmpSessionService}.
+ * Authentication: {@link org.apache.beam.sdk.io.solace.broker.BasicAuthJcsmpSessionServiceFactory}.
 *

For the authentication to the SEMP API ({@link Read#withSempClientFactory(SempClientFactory)}) * the connector provides {@link org.apache.beam.sdk.io.solace.broker.BasicAuthSempClientFactory} to @@ -639,9 +639,8 @@ public Read withSempClientFactory(SempClientFactory sempClientFactory) { *

  • create a {@link org.apache.beam.sdk.io.solace.broker.MessageReceiver}. * * - *

    An existing implementation of the SempClientFactory includes {@link - * org.apache.beam.sdk.io.solace.broker.BasicAuthJcsmpSessionService} which implements the Basic - * Authentication to Solace. * + *

    The {@link BasicAuthJcsmpSessionServiceFactory} is an existing implementation of the + * {@link SessionServiceFactory} which implements the Basic Authentication to Solace. * *

    To use it, specify the credentials with the builder methods. * * diff --git a/sdks/java/io/solace/src/main/java/org/apache/beam/sdk/io/solace/broker/BasicAuthJcsmpSessionServiceFactory.java b/sdks/java/io/solace/src/main/java/org/apache/beam/sdk/io/solace/broker/BasicAuthJcsmpSessionServiceFactory.java index 199dcccee854..5c3952e6b30f 100644 --- a/sdks/java/io/solace/src/main/java/org/apache/beam/sdk/io/solace/broker/BasicAuthJcsmpSessionServiceFactory.java +++ b/sdks/java/io/solace/src/main/java/org/apache/beam/sdk/io/solace/broker/BasicAuthJcsmpSessionServiceFactory.java @@ -20,13 +20,14 @@ import static org.apache.beam.sdk.io.solace.broker.SessionService.DEFAULT_VPN_NAME; import com.google.auto.value.AutoValue; +import com.solacesystems.jcsmp.JCSMPProperties; /** - * A factory for creating {@link BasicAuthJcsmpSessionService} instances. Extends {@link + * A factory for creating {@link JcsmpSessionService} instances. Extends {@link * SessionServiceFactory}. * - *

    This factory provides a way to create {@link BasicAuthJcsmpSessionService} instances with - * authenticate to Solace with Basic Authentication. + *

    This factory provides a way to create {@link JcsmpSessionService} that use Basic + * Authentication. */ @AutoValue public abstract class BasicAuthJcsmpSessionServiceFactory extends SessionServiceFactory { @@ -69,15 +70,14 @@ public abstract static class Builder { @Override public SessionService create() { - BasicAuthJcsmpSessionService.Builder builder = BasicAuthJcsmpSessionService.builder(); - if (queue != null) { - builder = builder.queueName(queue.getName()); - } - return builder - .host(host()) - .username(username()) - .password(password()) - .vpnName(vpnName()) - .build(); + JCSMPProperties jcsmpProperties = new JCSMPProperties(); + jcsmpProperties.setProperty(JCSMPProperties.VPN_NAME, vpnName()); + jcsmpProperties.setProperty( + JCSMPProperties.AUTHENTICATION_SCHEME, JCSMPProperties.AUTHENTICATION_SCHEME_BASIC); + jcsmpProperties.setProperty(JCSMPProperties.USERNAME, username()); + jcsmpProperties.setProperty(JCSMPProperties.PASSWORD, password()); + jcsmpProperties.setProperty(JCSMPProperties.HOST, host()); + + return JcsmpSessionService.create(jcsmpProperties, getQueue()); } } diff --git a/sdks/java/io/solace/src/main/java/org/apache/beam/sdk/io/solace/broker/BasicAuthJcsmpSessionService.java b/sdks/java/io/solace/src/main/java/org/apache/beam/sdk/io/solace/broker/JcsmpSessionService.java similarity index 74% rename from sdks/java/io/solace/src/main/java/org/apache/beam/sdk/io/solace/broker/BasicAuthJcsmpSessionService.java rename to sdks/java/io/solace/src/main/java/org/apache/beam/sdk/io/solace/broker/JcsmpSessionService.java index d4c9a3ec6210..818368a92b9f 100644 --- a/sdks/java/io/solace/src/main/java/org/apache/beam/sdk/io/solace/broker/BasicAuthJcsmpSessionService.java +++ b/sdks/java/io/solace/src/main/java/org/apache/beam/sdk/io/solace/broker/JcsmpSessionService.java @@ -43,45 +43,16 @@ /** * A class that manages a connection to a Solace broker using basic authentication. * - *

    This class provides a way to connect to a Solace broker and receive messages from a queue. The - * connection is established using basic authentication. + *

    This class provides a way to connect to a Solace broker and receive messages from a queue. */ @AutoValue -public abstract class BasicAuthJcsmpSessionService extends SessionService { +public abstract class JcsmpSessionService extends SessionService { - /** The name of the queue to receive messages from. */ - public abstract @Nullable String queueName(); + /** JCSMP properties used to establish the connection. */ + abstract JCSMPProperties jcsmpProperties(); - /** The host name or IP address of the Solace broker. Format: Host[:Port] */ - public abstract String host(); - - /** The username to use for authentication. */ - public abstract String username(); - - /** The password to use for authentication. */ - public abstract String password(); - - /** The name of the VPN to connect to. */ - public abstract String vpnName(); - - public static Builder builder() { - return new AutoValue_BasicAuthJcsmpSessionService.Builder().vpnName(DEFAULT_VPN_NAME); - } - - @AutoValue.Builder - public abstract static class Builder { - public abstract Builder queueName(@Nullable String queueName); - - public abstract Builder host(String host); - - public abstract Builder username(String username); - - public abstract Builder password(String password); - - public abstract Builder vpnName(String vpnName); - - public abstract BasicAuthJcsmpSessionService build(); - } + /** The Queue to receive messages from. */ + abstract @Nullable Queue queue(); @Nullable private transient JCSMPSession jcsmpSession; @Nullable private transient MessageReceiver messageReceiver; @@ -90,9 +61,19 @@ public abstract static class Builder { new ConcurrentLinkedQueue<>(); private final RetryCallableManager retryCallableManager = RetryCallableManager.create(); + public static JcsmpSessionService create(JCSMPProperties jcsmpProperties, @Nullable Queue queue) { + return new AutoValue_JcsmpSessionService(jcsmpProperties, queue); + } + + @Override + public JCSMPProperties getSessionProperties() { + return jcsmpProperties(); + } + @Override public void connect() { - retryCallableManager.retryCallable(this::connectSession, ImmutableSet.of(JCSMPException.class)); + retryCallableManager.retryCallable( + this::connectReadSession, ImmutableSet.of(JCSMPException.class)); } @Override @@ -158,10 +139,7 @@ private MessageProducer createXMLMessageProducer(SubmissionMode submissionMode) } private MessageReceiver createFlowReceiver() throws JCSMPException, IOException { - - Queue queue = - JCSMPFactory.onlyInstance() - .createQueue(checkStateNotNull(queueName(), "SolaceIO.Read: Queue is not set.")); + Queue queue = checkStateNotNull(queue(), "SolaceIO.Read: Queue is not set."); ConsumerFlowProperties flowProperties = new ConsumerFlowProperties(); flowProperties.setEndpoint(queue); @@ -190,9 +168,9 @@ private static FlowReceiver createFlowReceiver( return jcsmpSession.createFlow(null, flowProperties, endpointProperties); } - private int connectSession() throws JCSMPException { + private int connectReadSession() throws JCSMPException { if (jcsmpSession == null) { - jcsmpSession = createSessionObject(); + jcsmpSession = createReadSessionObject(); } jcsmpSession.connect(); return 0; @@ -206,25 +184,12 @@ private int connectWriteSession(SubmissionMode mode) throws JCSMPException { return 0; } - private JCSMPSession createSessionObject() throws InvalidPropertiesException { - JCSMPProperties properties = initializeSessionProperties(new JCSMPProperties()); - return JCSMPFactory.onlyInstance().createSession(properties); + private JCSMPSession 
createReadSessionObject() throws InvalidPropertiesException { + return JCSMPFactory.onlyInstance().createSession(jcsmpProperties()); } private JCSMPSession createWriteSessionObject(SubmissionMode mode) throws InvalidPropertiesException { return JCSMPFactory.onlyInstance().createSession(initializeWriteSessionProperties(mode)); } - - @Override - public JCSMPProperties initializeSessionProperties(JCSMPProperties baseProps) { - baseProps.setProperty(JCSMPProperties.VPN_NAME, vpnName()); - - baseProps.setProperty( - JCSMPProperties.AUTHENTICATION_SCHEME, JCSMPProperties.AUTHENTICATION_SCHEME_BASIC); - baseProps.setProperty(JCSMPProperties.USERNAME, username()); - baseProps.setProperty(JCSMPProperties.PASSWORD, password()); - baseProps.setProperty(JCSMPProperties.HOST, host()); - return baseProps; - } } diff --git a/sdks/java/io/solace/src/main/java/org/apache/beam/sdk/io/solace/broker/SessionService.java b/sdks/java/io/solace/src/main/java/org/apache/beam/sdk/io/solace/broker/SessionService.java index 6dcd0b652616..5342f86c13e1 100644 --- a/sdks/java/io/solace/src/main/java/org/apache/beam/sdk/io/solace/broker/SessionService.java +++ b/sdks/java/io/solace/src/main/java/org/apache/beam/sdk/io/solace/broker/SessionService.java @@ -32,8 +32,8 @@ * messaging system. It allows for establishing a connection, creating a message-receiver object, * checking if the connection is closed or not, and gracefully closing the session. * - *

    Override this class and the method {@link #initializeSessionProperties(JCSMPProperties)} with - * your specific properties, including all those related to authentication. + *

    Override this class and the method {@link #getSessionProperties()} with your specific + * properties, including all those related to authentication. * *

    The connector will call the method only once per session created, so you can perform * relatively heavy operations in that method (e.g. connect to a store or vault to retrieve @@ -70,8 +70,7 @@ *

    The connector ensures that no two threads will be calling that method at the same time, so you * don't have to take any specific precautions to avoid race conditions. * - *

    For basic authentication, use {@link BasicAuthJcsmpSessionService} and {@link - * BasicAuthJcsmpSessionServiceFactory}. + *

    For basic authentication, use {@link BasicAuthJcsmpSessionServiceFactory}. * *

    For other situations, you need to extend this class and implement the `equals` method, so two * instances of your class can be compared by value. We recommend using AutoValue for that. For @@ -143,11 +142,7 @@ public abstract class SessionService implements Serializable { /** * Override this method and provide your specific properties, including all those related to - * authentication, and possibly others too. The {@code}baseProperties{@code} parameter sets the - * Solace VPN to "default" if none is specified. - * - *

    You should add your properties to the parameter {@code}baseProperties{@code}, and return the - * result. + * authentication, and possibly others too. * *

    The method will be used whenever the session needs to be created or refreshed. If you are * setting credentials with expiration, just make sure that the latest available credentials (e.g. @@ -160,7 +155,7 @@ public abstract class SessionService implements Serializable { * href="https://docs.solace.com/API-Developer-Online-Ref-Documentation/java/com/solacesystems/jcsmp/JCSMPProperties.html">https://docs.solace.com/API-Developer-Online-Ref-Documentation/java/com/solacesystems/jcsmp/JCSMPProperties.html * */ - public abstract JCSMPProperties initializeSessionProperties(JCSMPProperties baseProperties); + public abstract JCSMPProperties getSessionProperties(); /** * You need to override this method to be able to compare these objects by value. We recommend @@ -187,22 +182,10 @@ public abstract class SessionService implements Serializable { * token), this method will be called again. */ public final JCSMPProperties initializeWriteSessionProperties(SolaceIO.SubmissionMode mode) { - JCSMPProperties jcsmpProperties = initializeSessionProperties(getDefaultProperties()); + JCSMPProperties jcsmpProperties = getSessionProperties(); return overrideConnectorProperties(jcsmpProperties, mode); } - private static JCSMPProperties getDefaultProperties() { - JCSMPProperties props = new JCSMPProperties(); - props.setProperty(JCSMPProperties.VPN_NAME, DEFAULT_VPN_NAME); - // Outgoing messages will have a sender timestamp field populated - props.setProperty(JCSMPProperties.GENERATE_SEND_TIMESTAMPS, true); - // Make XMLProducer safe to access from several threads. This is the default value, setting - // it just in case. - props.setProperty(JCSMPProperties.PUB_MULTI_THREAD, true); - - return props; - } - /** * This method overrides some properties for the broker session to prevent misconfiguration, * taking into account how the write connector works. @@ -210,6 +193,19 @@ private static JCSMPProperties getDefaultProperties() { private static JCSMPProperties overrideConnectorProperties( JCSMPProperties props, SolaceIO.SubmissionMode mode) { + if (props.getProperty(JCSMPProperties.VPN_NAME) == null) { + props.setProperty(JCSMPProperties.VPN_NAME, DEFAULT_VPN_NAME); + } + if (props.getProperty(JCSMPProperties.VPN_NAME) == null) { + // Outgoing messages will have a sender timestamp field populated + props.setProperty(JCSMPProperties.GENERATE_SEND_TIMESTAMPS, true); + } + if (props.getProperty(JCSMPProperties.VPN_NAME) == null) { + // Make XMLProducer safe to access from several threads. This is the default value, setting + // it just in case. + props.setProperty(JCSMPProperties.PUB_MULTI_THREAD, true); + } + // PUB_ACK_WINDOW_SIZE heavily affects performance when publishing persistent // messages. It can be a value between 1 and 255. This is the batch size for the ack // received from Solace. 
A value of 1 will have the lowest latency, but a very low diff --git a/sdks/java/io/solace/src/main/java/org/apache/beam/sdk/io/solace/broker/SessionServiceFactory.java b/sdks/java/io/solace/src/main/java/org/apache/beam/sdk/io/solace/broker/SessionServiceFactory.java index bd1f3c23694d..fe49e3493657 100644 --- a/sdks/java/io/solace/src/main/java/org/apache/beam/sdk/io/solace/broker/SessionServiceFactory.java +++ b/sdks/java/io/solace/src/main/java/org/apache/beam/sdk/io/solace/broker/SessionServiceFactory.java @@ -20,6 +20,7 @@ import com.solacesystems.jcsmp.Queue; import java.io.Serializable; import org.apache.beam.sdk.io.solace.SolaceIO.SubmissionMode; +import org.apache.beam.sdk.io.solace.data.Solace; import org.checkerframework.checker.nullness.qual.Nullable; /** @@ -61,7 +62,7 @@ public abstract class SessionServiceFactory implements Serializable { * This could be used to associate the created SessionService with a specific queue for message * handling. */ - @Nullable Queue queue; + private @Nullable Queue queue; /** * The write submission mode. This is set when the writers are created. This property is used only @@ -75,6 +76,33 @@ public abstract class SessionServiceFactory implements Serializable { */ public abstract SessionService create(); + /** + * This method is called in the {@link + * org.apache.beam.sdk.io.solace.SolaceIO.Read#expand(org.apache.beam.sdk.values.PBegin)} method + * to set the Queue reference based on {@link + * org.apache.beam.sdk.io.solace.SolaceIO.Read#from(Solace.Queue)} or {@link + * org.apache.beam.sdk.io.solace.SolaceIO.Read#from(Solace.Topic)}. The queue can be retrieved in + * the classes that inherit {@link SessionServiceFactory} with the getter method {@link + * SessionServiceFactory#getQueue()} + */ + public final void setQueue(Queue queue) { + this.queue = queue; + } + + /** + * Getter for the queue. This is nullable, because at the construction time this reference is + * null. Once the pipeline is compiled and the {@link + * org.apache.beam.sdk.io.solace.SolaceIO.Read#expand(org.apache.beam.sdk.values.PBegin)} method + * is called, this reference is valid. + * + * @return a reference to the queue which is set with the {@link + * org.apache.beam.sdk.io.solace.SolaceIO.Read#from(Solace.Queue)} or {@link + * org.apache.beam.sdk.io.solace.SolaceIO.Read#from(Solace.Topic)} + */ + public final @Nullable Queue getQueue() { + return queue; + } + /** * You need to override this method to be able to compare these objects by value. We recommend * using AutoValue for that. @@ -89,15 +117,6 @@ public abstract class SessionServiceFactory implements Serializable { @Override public abstract int hashCode(); - /** - * This method is called in the {@link - * org.apache.beam.sdk.io.solace.SolaceIO.Read#expand(org.apache.beam.sdk.values.PBegin)} method - * to set the Queue reference. - */ - public void setQueue(Queue queue) { - this.queue = queue; - } - /** * Called by the write connector to set the submission mode used to create the message producers. 
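A minimal sketch, for illustration only and not part of this commit, of how the refactored contract can be extended: the class name and the environment-variable password lookup are invented for the example, while JcsmpSessionService.create(...), getQueue(), and the JCSMPProperties keys are taken from the diffs above. Resolving the password inside create() follows the SessionService javadoc's note that session setup happens once per session, so heavier lookups (a vault or secret manager) are reasonable there.

import com.solacesystems.jcsmp.JCSMPProperties;
import java.util.Objects;
import org.apache.beam.sdk.io.solace.broker.JcsmpSessionService;
import org.apache.beam.sdk.io.solace.broker.SessionService;
import org.apache.beam.sdk.io.solace.broker.SessionServiceFactory;

/** Hypothetical factory that resolves the password only when create() runs, not at graph construction. */
public class EnvPasswordJcsmpSessionServiceFactory extends SessionServiceFactory {
  private final String host;
  private final String username;
  private final String vpnName;

  public EnvPasswordJcsmpSessionServiceFactory(String host, String username, String vpnName) {
    this.host = host;
    this.username = username;
    this.vpnName = vpnName;
  }

  @Override
  public SessionService create() {
    JCSMPProperties props = new JCSMPProperties();
    props.setProperty(JCSMPProperties.VPN_NAME, vpnName);
    props.setProperty(
        JCSMPProperties.AUTHENTICATION_SCHEME, JCSMPProperties.AUTHENTICATION_SCHEME_BASIC);
    props.setProperty(JCSMPProperties.USERNAME, username);
    // Hypothetical lookup; a real factory might call a vault or secret manager instead.
    props.setProperty(JCSMPProperties.PASSWORD, System.getenv("SOLACE_PASSWORD"));
    props.setProperty(JCSMPProperties.HOST, host);
    // The queue is injected by SolaceIO.Read#expand via setQueue(...) and read back with getQueue().
    return JcsmpSessionService.create(props, getQueue());
  }

  // The factory is Serializable and compared by value, so equals/hashCode are implemented by hand here.
  @Override
  public boolean equals(Object o) {
    if (this == o) {
      return true;
    }
    if (!(o instanceof EnvPasswordJcsmpSessionServiceFactory)) {
      return false;
    }
    EnvPasswordJcsmpSessionServiceFactory that = (EnvPasswordJcsmpSessionServiceFactory) o;
    return Objects.equals(host, that.host)
        && Objects.equals(username, that.username)
        && Objects.equals(vpnName, that.vpnName);
  }

  @Override
  public int hashCode() {
    return Objects.hash(host, username, vpnName);
  }
}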
*/ diff --git a/sdks/java/io/solace/src/test/java/org/apache/beam/sdk/io/solace/MockEmptySessionService.java b/sdks/java/io/solace/src/test/java/org/apache/beam/sdk/io/solace/MockEmptySessionService.java index 7631d32f63cc..f6bb67419541 100644 --- a/sdks/java/io/solace/src/test/java/org/apache/beam/sdk/io/solace/MockEmptySessionService.java +++ b/sdks/java/io/solace/src/test/java/org/apache/beam/sdk/io/solace/MockEmptySessionService.java @@ -61,7 +61,7 @@ public void connect() { } @Override - public JCSMPProperties initializeSessionProperties(JCSMPProperties baseProperties) { + public JCSMPProperties getSessionProperties() { throw new UnsupportedOperationException(exceptionMessage); } } diff --git a/sdks/java/io/solace/src/test/java/org/apache/beam/sdk/io/solace/MockSessionService.java b/sdks/java/io/solace/src/test/java/org/apache/beam/sdk/io/solace/MockSessionService.java index 6d28bcefc84c..e888c62c8522 100644 --- a/sdks/java/io/solace/src/test/java/org/apache/beam/sdk/io/solace/MockSessionService.java +++ b/sdks/java/io/solace/src/test/java/org/apache/beam/sdk/io/solace/MockSessionService.java @@ -102,11 +102,11 @@ public Queue getPublishedResultsQueue() { public void connect() {} @Override - public JCSMPProperties initializeSessionProperties(JCSMPProperties baseProperties) { + public JCSMPProperties getSessionProperties() { // Let's override some properties that will be overriden by the connector // Opposite of the mode, to test that is overriden + JCSMPProperties baseProperties = new JCSMPProperties(); baseProperties.setProperty(JCSMPProperties.MESSAGE_CALLBACK_ON_REACTOR, callbackOnReactor); - baseProperties.setProperty(JCSMPProperties.PUB_ACK_WINDOW_SIZE, ackWindowSizeForTesting); return baseProperties; diff --git a/sdks/java/io/solace/src/test/java/org/apache/beam/sdk/io/solace/it/FixedCredentialsBasicAuthJcsmpSessionServiceFactory.java b/sdks/java/io/solace/src/test/java/org/apache/beam/sdk/io/solace/it/FixedCredentialsBasicAuthJcsmpSessionServiceFactory.java new file mode 100644 index 000000000000..2fdee6f9138d --- /dev/null +++ b/sdks/java/io/solace/src/test/java/org/apache/beam/sdk/io/solace/it/FixedCredentialsBasicAuthJcsmpSessionServiceFactory.java @@ -0,0 +1,62 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +package org.apache.beam.sdk.io.solace.it; + +import com.solacesystems.jcsmp.JCSMPProperties; +import java.util.Objects; +import org.apache.beam.sdk.io.solace.broker.JcsmpSessionService; +import org.apache.beam.sdk.io.solace.broker.SessionService; +import org.apache.beam.sdk.io.solace.broker.SessionServiceFactory; + +public class FixedCredentialsBasicAuthJcsmpSessionServiceFactory extends SessionServiceFactory { + private final String host; + + public FixedCredentialsBasicAuthJcsmpSessionServiceFactory(String host) { + this.host = host; + } + + @Override + public SessionService create() { + JCSMPProperties jcsmpProperties = new JCSMPProperties(); + jcsmpProperties.setProperty(JCSMPProperties.VPN_NAME, SolaceContainerManager.VPN_NAME); + jcsmpProperties.setProperty( + JCSMPProperties.AUTHENTICATION_SCHEME, JCSMPProperties.AUTHENTICATION_SCHEME_BASIC); + jcsmpProperties.setProperty(JCSMPProperties.USERNAME, SolaceContainerManager.USERNAME); + jcsmpProperties.setProperty(JCSMPProperties.PASSWORD, SolaceContainerManager.PASSWORD); + jcsmpProperties.setProperty(JCSMPProperties.HOST, host); + return JcsmpSessionService.create(jcsmpProperties, getQueue()); + } + + @Override + public boolean equals(Object o) { + if (this == o) { + return true; + } + if (o == null || getClass() != o.getClass()) { + return false; + } + FixedCredentialsBasicAuthJcsmpSessionServiceFactory that = + (FixedCredentialsBasicAuthJcsmpSessionServiceFactory) o; + return Objects.equals(host, that.host); + } + + @Override + public int hashCode() { + return Objects.hashCode(host); + } +} diff --git a/sdks/java/io/solace/src/test/java/org/apache/beam/sdk/io/solace/it/SolaceIOCustomSessionServiceFactoryIT.java b/sdks/java/io/solace/src/test/java/org/apache/beam/sdk/io/solace/it/SolaceIOCustomSessionServiceFactoryIT.java new file mode 100644 index 000000000000..b047026a2bbd --- /dev/null +++ b/sdks/java/io/solace/src/test/java/org/apache/beam/sdk/io/solace/it/SolaceIOCustomSessionServiceFactoryIT.java @@ -0,0 +1,193 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +package org.apache.beam.sdk.io.solace.it; + +import static org.apache.beam.sdk.io.solace.it.SolaceContainerManager.TOPIC_NAME; +import static org.apache.beam.sdk.values.TypeDescriptors.strings; +import static org.junit.Assert.assertEquals; + +import com.solacesystems.jcsmp.DeliveryMode; +import java.io.IOException; +import org.apache.beam.sdk.Pipeline; +import org.apache.beam.sdk.PipelineResult; +import org.apache.beam.sdk.coders.KvCoder; +import org.apache.beam.sdk.extensions.avro.coders.AvroCoder; +import org.apache.beam.sdk.io.solace.SolaceIO; +import org.apache.beam.sdk.io.solace.SolaceIO.WriterType; +import org.apache.beam.sdk.io.solace.broker.BasicAuthJcsmpSessionServiceFactory; +import org.apache.beam.sdk.io.solace.broker.BasicAuthSempClientFactory; +import org.apache.beam.sdk.io.solace.data.Solace; +import org.apache.beam.sdk.io.solace.data.Solace.Queue; +import org.apache.beam.sdk.io.solace.data.SolaceDataUtils; +import org.apache.beam.sdk.io.solace.write.SolaceOutput; +import org.apache.beam.sdk.metrics.Counter; +import org.apache.beam.sdk.metrics.Metrics; +import org.apache.beam.sdk.options.PipelineOptionsFactory; +import org.apache.beam.sdk.options.StreamingOptions; +import org.apache.beam.sdk.testing.TestPipeline; +import org.apache.beam.sdk.testing.TestPipelineOptions; +import org.apache.beam.sdk.testing.TestStream; +import org.apache.beam.sdk.testutils.metrics.MetricsReader; +import org.apache.beam.sdk.transforms.DoFn; +import org.apache.beam.sdk.transforms.MapElements; +import org.apache.beam.sdk.transforms.ParDo; +import org.apache.beam.sdk.values.KV; +import org.apache.beam.sdk.values.PCollection; +import org.apache.beam.sdk.values.TypeDescriptor; +import org.joda.time.Duration; +import org.joda.time.Instant; +import org.junit.AfterClass; +import org.junit.BeforeClass; +import org.junit.Rule; +import org.junit.Test; + +public class SolaceIOCustomSessionServiceFactoryIT { + private static final String NAMESPACE = SolaceIOCustomSessionServiceFactoryIT.class.getName(); + private static final String READ_COUNT = "read_count"; + private static final String QUEUE_NAME = "test_queue"; + private static final long PUBLISH_MESSAGE_COUNT = 20; + private static final TestPipelineOptions pipelineOptions; + private static SolaceContainerManager solaceContainerManager; + + static { + pipelineOptions = PipelineOptionsFactory.create().as(TestPipelineOptions.class); + pipelineOptions.as(StreamingOptions.class).setStreaming(true); + // For the read connector tests, we need to make sure that p.run() does not block + pipelineOptions.setBlockOnRun(false); + pipelineOptions.as(TestPipelineOptions.class).setBlockOnRun(false); + } + + @Rule public final TestPipeline pipeline = TestPipeline.fromOptions(pipelineOptions); + + @BeforeClass + public static void setup() throws IOException { + solaceContainerManager = new SolaceContainerManager(); + solaceContainerManager.start(); + solaceContainerManager.createQueueWithSubscriptionTopic(QUEUE_NAME); + } + + @AfterClass + public static void afterClass() { + if (solaceContainerManager != null) { + solaceContainerManager.stop(); + } + } + + /** + * This test verifies ability to create a custom {@link + * org.apache.beam.sdk.io.solace.broker.SessionServiceFactory} and presents an example of doing + * that with the custom {@link FixedCredentialsBasicAuthJcsmpSessionServiceFactory}. 
+ */ + @Test + public void test01writeAndReadWithCustomSessionServiceFactory() { + Pipeline writerPipeline = createWriterPipeline(WriterType.BATCHED); + writerPipeline + .apply( + "Read from Solace", + SolaceIO.read() + .from(Queue.fromName(QUEUE_NAME)) + .withMaxNumConnections(1) + .withDeduplicateRecords(true) + .withSempClientFactory( + BasicAuthSempClientFactory.builder() + .host("http://localhost:" + solaceContainerManager.sempPortMapped) + .username("admin") + .password("admin") + .vpnName(SolaceContainerManager.VPN_NAME) + .build()) + .withSessionServiceFactory( + new FixedCredentialsBasicAuthJcsmpSessionServiceFactory( + "localhost:" + solaceContainerManager.jcsmpPortMapped))) + .apply("Count", ParDo.of(new CountingFn<>(NAMESPACE, READ_COUNT))); + + PipelineResult pipelineResult = writerPipeline.run(); + // We need enough time for Beam to pull all messages from the queue, but we need a timeout too, + // as the Read connector will keep attempting to read forever. + pipelineResult.waitUntilFinish(Duration.standardSeconds(15)); + + MetricsReader metricsReader = new MetricsReader(pipelineResult, NAMESPACE); + long actualRecordsCount = metricsReader.getCounterMetric(READ_COUNT); + assertEquals(PUBLISH_MESSAGE_COUNT, actualRecordsCount); + } + + private Pipeline createWriterPipeline(WriterType writerType) { + TestStream.Builder> kvBuilder = + TestStream.create(KvCoder.of(AvroCoder.of(String.class), AvroCoder.of(String.class))) + .advanceWatermarkTo(Instant.EPOCH); + + for (int i = 0; i < PUBLISH_MESSAGE_COUNT; i++) { + String key = "Solace-Message-ID:m" + i; + String payload = String.format("{\"field_str\":\"value\",\"field_int\":123%d}", i); + kvBuilder = + kvBuilder + .addElements(KV.of(key, payload)) + .advanceProcessingTime(Duration.standardSeconds(60)); + } + + TestStream> testStream = kvBuilder.advanceWatermarkToInfinity(); + + PCollection> kvs = + pipeline.apply(String.format("Test stream %s", writerType), testStream); + + PCollection records = + kvs.apply( + String.format("To Record %s", writerType), + MapElements.into(TypeDescriptor.of(Solace.Record.class)) + .via(kv -> SolaceDataUtils.getSolaceRecord(kv.getValue(), kv.getKey()))); + + SolaceOutput result = + records.apply( + String.format("Write to Solace %s", writerType), + SolaceIO.write() + .to(Solace.Topic.fromName(TOPIC_NAME)) + .withSubmissionMode(SolaceIO.SubmissionMode.TESTING) + .withWriterType(writerType) + .withDeliveryMode(DeliveryMode.PERSISTENT) + .withNumberOfClientsPerWorker(1) + .withNumShards(1) + .withSessionServiceFactory( + BasicAuthJcsmpSessionServiceFactory.builder() + .host("localhost:" + solaceContainerManager.jcsmpPortMapped) + .username(SolaceContainerManager.USERNAME) + .password(SolaceContainerManager.PASSWORD) + .vpnName(SolaceContainerManager.VPN_NAME) + .build())); + result + .getSuccessfulPublish() + .apply( + String.format("Get ids %s", writerType), + MapElements.into(strings()).via(Solace.PublishResult::getMessageId)); + + return pipeline; + } + + private static class CountingFn extends DoFn { + + private final Counter elementCounter; + + CountingFn(String namespace, String name) { + elementCounter = Metrics.counter(namespace, name); + } + + @ProcessElement + public void processElement(@Element T record, OutputReceiver c) { + elementCounter.inc(1L); + c.output(record); + } + } +} diff --git a/sdks/java/io/solace/src/test/java/org/apache/beam/sdk/io/solace/it/SolaceIOIT.java b/sdks/java/io/solace/src/test/java/org/apache/beam/sdk/io/solace/it/SolaceIOIT.java index ee5d206533dc..8311a67bf6c1 
100644 --- a/sdks/java/io/solace/src/test/java/org/apache/beam/sdk/io/solace/it/SolaceIOIT.java +++ b/sdks/java/io/solace/src/test/java/org/apache/beam/sdk/io/solace/it/SolaceIOIT.java @@ -108,6 +108,7 @@ public void test02Read() { .from(Queue.fromName(queueName)) .withDeduplicateRecords(true) .withMaxNumConnections(1) + .withDeduplicateRecords(true) .withSempClientFactory( BasicAuthSempClientFactory.builder() .host("http://localhost:" + solaceContainerManager.sempPortMapped) From 06644ece5889e4b68cd28643b2987f645e1ae025 Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Tue, 7 Jan 2025 10:28:33 -0500 Subject: [PATCH 27/43] Bump com.gradle.develocity from 3.17.6 to 3.19 (#33375) Bumps com.gradle.develocity from 3.17.6 to 3.19. --- updated-dependencies: - dependency-name: com.gradle.develocity dependency-type: direct:production update-type: version-update:semver-minor ... Signed-off-by: dependabot[bot] Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com> --- settings.gradle.kts | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/settings.gradle.kts b/settings.gradle.kts index 6cce1ec0a506..f78cf0517906 100644 --- a/settings.gradle.kts +++ b/settings.gradle.kts @@ -24,7 +24,7 @@ pluginManagement { } plugins { - id("com.gradle.develocity") version "3.17.6" + id("com.gradle.develocity") version "3.19" id("com.gradle.common-custom-user-data-gradle-plugin") version "2.0.1" } From 39850bf1892b547e603f72a38a268c31e922858b Mon Sep 17 00:00:00 2001 From: Yi Hu Date: Tue, 7 Jan 2025 10:37:41 -0500 Subject: [PATCH 28/43] Clean up py38 targets (#33510) * Clean up post py38 TODOs --- .test-infra/jenkins/build.gradle | 4 ++-- .test-infra/jenkins/metrics_report/tox.ini | 4 ++-- .../apache_beam/testing/synthetic_pipeline.py | 17 +++-------------- sdks/python/test-suites/tox/common.gradle | 2 +- sdks/python/tox.ini | 2 +- settings.gradle.kts | 5 ----- 6 files changed, 9 insertions(+), 25 deletions(-) diff --git a/.test-infra/jenkins/build.gradle b/.test-infra/jenkins/build.gradle index df43717e2fc3..862e8756c6d7 100644 --- a/.test-infra/jenkins/build.gradle +++ b/.test-infra/jenkins/build.gradle @@ -36,11 +36,11 @@ task generateMetricsReport { doLast { exec { executable 'sh' - args '-c', ". ${envdir}/bin/activate && tox -e py38-test -c ${toxConfigFilePath}" + args '-c', ". ${envdir}/bin/activate && tox -e py39-test -c ${toxConfigFilePath}" } exec { executable 'sh' - args '-c', ". ${envdir}/bin/activate && tox -e py38-generate-report -c ${toxConfigFilePath} -- --influx-db=${influxDb} --influx-host=${influxHost} --influx-port=${influxPort} --output-file=${generateMetricsReportPath}" + args '-c', ". 
${envdir}/bin/activate && tox -e py39-generate-report -c ${toxConfigFilePath} -- --influx-db=${influxDb} --influx-host=${influxHost} --influx-port=${influxPort} --output-file=${generateMetricsReportPath}" } logger.info('Create metrics report file {}', generateMetricsReportPath) } diff --git a/.test-infra/jenkins/metrics_report/tox.ini b/.test-infra/jenkins/metrics_report/tox.ini index d143a0dcf59c..5126b337afcc 100644 --- a/.test-infra/jenkins/metrics_report/tox.ini +++ b/.test-infra/jenkins/metrics_report/tox.ini @@ -25,12 +25,12 @@ commands_pre = pip --version pip check -[testenv:py38-test] +[testenv:py39-test] deps = -r requirements.txt passenv = WORKSPACE,INFLUXDB_USER,INFLUXDB_USER_PASSWORD commands = python -m unittest dashboards_parser.py -[testenv:py38-generate-report] +[testenv:py39-generate-report] deps = -r requirements.txt passenv = WORKSPACE,INFLUXDB_USER,INFLUXDB_USER_PASSWORD,GITHUB_WORKSPACE commands = python report_generator.py {posargs} diff --git a/sdks/python/apache_beam/testing/synthetic_pipeline.py b/sdks/python/apache_beam/testing/synthetic_pipeline.py index 4e3a1f48554d..b18de244e3f8 100644 --- a/sdks/python/apache_beam/testing/synthetic_pipeline.py +++ b/sdks/python/apache_beam/testing/synthetic_pipeline.py @@ -39,7 +39,6 @@ import logging import math import os -import sys import time from random import Random from typing import Optional @@ -81,15 +80,6 @@ class _Random(Random): # for compatibility reasons. random_sample = Random.random - # TODO(yathu) just use builtin rand_bytes when drop py38 support - def rand_bytes(self, length): - """Returns random bytes. - - Args: - length (int): Number of random bytes. - """ - return self.getrandbits(length * 8).to_bytes(length, sys.byteorder) - def get_generator(seed: Optional[int] = None, algorithm: Optional[str] = None): if algorithm is None or algorithm == 'builtin': @@ -478,10 +468,10 @@ def _gen_kv_pair(self, generator, index): # with equal probability. generator_hot = get_generator( seed=index % self._num_hot_keys, algorithm=self.gen_algo) - bytes_ = generator_hot.rand_bytes(self._key_size), generator.rand_bytes( + bytes_ = generator_hot.randbytes(self._key_size), generator.randbytes( self._value_size) else: - bytes_ = generator.rand_bytes(self.element_size) + bytes_ = generator.randbytes(self.element_size) bytes_ = bytes_[:self._key_size], bytes_[self._key_size:] return bytes_ @@ -611,8 +601,7 @@ def process( r = get_generator(algorithm=element.get('algorithm', None), seed=cur) time.sleep(element['sleep_per_input_record_sec']) yield ( - r.rand_bytes(element['key_size']), - r.rand_bytes(element['value_size'])) + r.randbytes(element['key_size']), r.randbytes(element['value_size'])) cur += 1 diff --git a/sdks/python/test-suites/tox/common.gradle b/sdks/python/test-suites/tox/common.gradle index 01265a6eeff5..75a12cdcf4cb 100644 --- a/sdks/python/test-suites/tox/common.gradle +++ b/sdks/python/test-suites/tox/common.gradle @@ -32,7 +32,7 @@ test.dependsOn "testPy${pythonVersionSuffix}ML" // toxTask "testPy${pythonVersionSuffix}Dask", "py${pythonVersionSuffix}-dask", "${posargs}" // test.dependsOn "testPy${pythonVersionSuffix}Dask" project.tasks.register("preCommitPy${pythonVersionSuffix}") { - // Since codecoverage reports will always be generated for py38, + // Since codecoverage reports will always be generated for py39, // all tests will be exercised. 
// dependsOn = ["testPy${pythonVersionSuffix}Cloud", "testPython${pythonVersionSuffix}"] dependsOn = ["testPy${pythonVersionSuffix}Cloud", "testPython${pythonVersionSuffix}"] diff --git a/sdks/python/tox.ini b/sdks/python/tox.ini index 68ac15ced70d..ed1f723d6d4d 100644 --- a/sdks/python/tox.ini +++ b/sdks/python/tox.ini @@ -396,7 +396,7 @@ commands = # Allow exit code 5 (no tests run) so that we can run this command safely on arbitrary subdirectories. /bin/sh -c 'pytest -o junit_suite_name={envname} --junitxml=pytest_{envname}.xml -n 6 -m uses_pytorch {posargs}; ret=$?; [ $ret = 5 ] && exit 0 || exit $ret' -# TODO(https://github.com/apache/beam/issues/25796) - uncomment onnx tox task in tox/py38/build.gradle once onnx supports protobuf 4.x.x +# TODO(https://github.com/apache/beam/issues/25796) - uncomment onnx tox task in tox/py39/build.gradle once onnx supports protobuf 4.x.x [testenv:py{39,310}-onnx-113] # TODO(https://github.com/apache/beam/issues/25443) # apparently tox has problem when substitution key has single value. Change back to -onnx-{113,...} diff --git a/settings.gradle.kts b/settings.gradle.kts index f78cf0517906..67ba773d4070 100644 --- a/settings.gradle.kts +++ b/settings.gradle.kts @@ -278,32 +278,27 @@ include(":sdks:python") include(":sdks:python:apache_beam:testing:load_tests") include(":sdks:python:apache_beam:testing:benchmarks:nexmark") include(":sdks:python:container") -include(":sdks:python:container:py38") include(":sdks:python:container:py39") include(":sdks:python:container:py310") include(":sdks:python:container:py311") include(":sdks:python:container:py312") include(":sdks:python:expansion-service-container") include(":sdks:python:test-suites:dataflow") -include(":sdks:python:test-suites:dataflow:py38") include(":sdks:python:test-suites:dataflow:py39") include(":sdks:python:test-suites:dataflow:py310") include(":sdks:python:test-suites:dataflow:py311") include(":sdks:python:test-suites:dataflow:py312") include(":sdks:python:test-suites:direct") -include(":sdks:python:test-suites:direct:py38") include(":sdks:python:test-suites:direct:py39") include(":sdks:python:test-suites:direct:py310") include(":sdks:python:test-suites:direct:py311") include(":sdks:python:test-suites:direct:py312") include(":sdks:python:test-suites:direct:xlang") -include(":sdks:python:test-suites:portable:py38") include(":sdks:python:test-suites:portable:py39") include(":sdks:python:test-suites:portable:py310") include(":sdks:python:test-suites:portable:py311") include(":sdks:python:test-suites:portable:py312") include(":sdks:python:test-suites:tox:pycommon") -include(":sdks:python:test-suites:tox:py38") include(":sdks:python:test-suites:tox:py39") include(":sdks:python:test-suites:tox:py310") include(":sdks:python:test-suites:tox:py311") From 1b78b67b29eaaa015fa6115c1cb4b17a6deb6d8b Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Tue, 7 Jan 2025 11:02:10 -0500 Subject: [PATCH 29/43] Bump yapf from 0.29.0 to 0.43.0 in /sdks/python (#33112) Bumps [yapf](https://github.com/google/yapf) from 0.29.0 to 0.43.0. - [Changelog](https://github.com/google/yapf/blob/main/CHANGELOG.md) - [Commits](https://github.com/google/yapf/compare/v0.29.0...v0.43.0) --- updated-dependencies: - dependency-name: yapf dependency-type: direct:production update-type: version-update:semver-minor ... 
Signed-off-by: dependabot[bot] Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com> --- sdks/python/pyproject.toml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sdks/python/pyproject.toml b/sdks/python/pyproject.toml index 8000c24f28aa..633a8511a19a 100644 --- a/sdks/python/pyproject.toml +++ b/sdks/python/pyproject.toml @@ -34,7 +34,7 @@ requires = [ 'pyyaml>=3.12,<7.0.0', # also update Jinja2 bounds in test-suites/xlang/build.gradle (look for xlangWrapperValidation task) "jinja2>=2.7.1,<4.0.0", - 'yapf==0.29.0' + 'yapf==0.43.0' ] From 4a5575cc149a568ec325e14a551ca485fcabf144 Mon Sep 17 00:00:00 2001 From: Jack McCluskey <34928439+jrmccluskey@users.noreply.github.com> Date: Tue, 7 Jan 2025 12:54:15 -0500 Subject: [PATCH 30/43] Update typehinting code to use primitives and collections types as main types over typing variants (#33427) * Refactor: Add convert_collections_from_typing() Added to convert typing module collections to built-ins. This function effectively reverses the operation of the function. Includes comprehensive unit tests to verify the correct conversion of various typing collections to their builtin counterparts, including nested structures and type variables. * Flip paradigm for convert_to_beam_type to be primative and collections-centric * update comment * fix clobbered import from merge * formatting * fix imports * address comments * remove extra import artifacts from merge --------- Co-authored-by: labs-code-app[bot] <161369871+labs-code-app[bot]@users.noreply.github.com> --- .../typehints/native_type_compatibility.py | 107 ++++++++++++++---- .../native_type_compatibility_test.py | 27 ++++- 2 files changed, 107 insertions(+), 27 deletions(-) diff --git a/sdks/python/apache_beam/typehints/native_type_compatibility.py b/sdks/python/apache_beam/typehints/native_type_compatibility.py index 6f704b37a969..3f57a573b505 100644 --- a/sdks/python/apache_beam/typehints/native_type_compatibility.py +++ b/sdks/python/apache_beam/typehints/native_type_compatibility.py @@ -20,6 +20,7 @@ # pytype: skip-file import collections +import collections.abc import logging import sys import types @@ -45,7 +46,18 @@ frozenset: typing.FrozenSet, } +_BUILTINS = [ + dict, + list, + tuple, + set, + frozenset, +] + _CONVERTED_COLLECTIONS = [ + collections.abc.Iterable, + collections.abc.Iterator, + collections.abc.Generator, collections.abc.Set, collections.abc.MutableSet, collections.abc.Collection, @@ -99,6 +111,17 @@ def _match_issubclass(match_against): return lambda user_type: _safe_issubclass(user_type, match_against) +def _is_primitive(user_type, primitive): + # catch bare primitives + if user_type is primitive: + return True + return getattr(user_type, '__origin__', None) is primitive + + +def _match_is_primitive(match_against): + return lambda user_type: _is_primitive(user_type, match_against) + + def _match_is_exactly_mapping(user_type): # Avoid unintentionally catching all subtypes (e.g. strings and mappings). expected_origin = collections.abc.Mapping @@ -106,7 +129,7 @@ def _match_is_exactly_mapping(user_type): def _match_is_exactly_iterable(user_type): - if user_type is typing.Iterable: + if user_type is typing.Iterable or user_type is collections.abc.Iterable: return True # Avoid unintentionally catching all subtypes (e.g. strings and mappings). 
expected_origin = collections.abc.Iterable @@ -152,11 +175,13 @@ def _match_is_union(user_type): return False -def match_is_set(user_type): - if _safe_issubclass(user_type, typing.Set): +def _match_is_set(user_type): + if _safe_issubclass(user_type, typing.Set) or _is_primitive(user_type, set): return True elif getattr(user_type, '__origin__', None) is not None: - return _safe_issubclass(user_type.__origin__, collections.abc.Set) + return _safe_issubclass( + user_type.__origin__, collections.abc.Set) or _safe_issubclass( + user_type.__origin__, collections.abc.MutableSet) else: return False @@ -197,6 +222,36 @@ def convert_builtin_to_typing(typ): return typ +def convert_typing_to_builtin(typ): + """Converts a given typing collections type to its builtin counterpart. + + Args: + typ: A typing type (e.g., typing.List[int]). + + Returns: + type: The corresponding builtin type (e.g., list[int]). + """ + origin = getattr(typ, '__origin__', None) + args = getattr(typ, '__args__', None) + # Typing types return the primitive type as the origin from 3.9 on + if origin not in _BUILTINS: + return typ + # Early return for bare types + if not args: + return origin + if origin is list: + return list[convert_typing_to_builtin(args[0])] + elif origin is dict: + return dict[convert_typing_to_builtin(args[0]), + convert_typing_to_builtin(args[1])] + elif origin is tuple: + return tuple[tuple(convert_typing_to_builtin(args))] + elif origin is set: + return set[convert_typing_to_builtin(args)] + elif origin is frozenset: + return frozenset[convert_typing_to_builtin(args)] + + def convert_collections_to_typing(typ): """Converts a given collections.abc type to a typing object. @@ -216,6 +271,12 @@ def convert_collections_to_typing(typ): return typ +def is_builtin(typ): + if typ in _BUILTINS: + return True + return getattr(typ, '__origin__', None) in _BUILTINS + + def convert_to_beam_type(typ): """Convert a given typing type to a Beam type. @@ -238,11 +299,8 @@ def convert_to_beam_type(typ): sys.version_info.minor >= 10) and (isinstance(typ, types.UnionType)): typ = typing.Union[typ] - if isinstance(typ, types.GenericAlias): - typ = convert_builtin_to_typing(typ) - - if getattr(typ, '__module__', None) == 'collections.abc': - typ = convert_collections_to_typing(typ) + if getattr(typ, '__module__', None) == 'typing': + typ = convert_typing_to_builtin(typ) typ_module = getattr(typ, '__module__', None) if isinstance(typ, typing.TypeVar): @@ -267,8 +325,16 @@ def convert_to_beam_type(typ): # TODO(https://github.com/apache/beam/issues/20076): Currently unhandled. _LOGGER.info('Converting NewType type hint to Any: "%s"', typ) return typehints.Any - elif (typ_module != 'typing') and (typ_module != 'collections.abc'): - # Only translate types from the typing and collections.abc modules. + elif typ_module == 'apache_beam.typehints.native_type_compatibility' and \ + getattr(typ, "__name__", typ.__origin__.__name__) == 'TypedWindowedValue': + # Need to pass through WindowedValue class so that it can be converted + # to the correct type constraint in Beam + # This is needed to fix https://github.com/apache/beam/issues/33356 + pass + + elif (typ_module != 'typing') and (typ_module != + 'collections.abc') and not is_builtin(typ): + # Only translate primitives and types from collections.abc and typing. 
return typ if (typ_module == 'collections.abc' and typ.__origin__ not in _CONVERTED_COLLECTIONS): @@ -285,39 +351,34 @@ def convert_to_beam_type(typ): _TypeMapEntry(match=is_forward_ref, arity=0, beam_type=typehints.Any), _TypeMapEntry(match=is_any, arity=0, beam_type=typehints.Any), _TypeMapEntry( - match=_match_issubclass(typing.Dict), - arity=2, - beam_type=typehints.Dict), + match=_match_is_primitive(dict), arity=2, beam_type=typehints.Dict), _TypeMapEntry( match=_match_is_exactly_iterable, arity=1, beam_type=typehints.Iterable), _TypeMapEntry( - match=_match_issubclass(typing.List), - arity=1, - beam_type=typehints.List), + match=_match_is_primitive(list), arity=1, beam_type=typehints.List), # FrozenSets are a specific instance of a set, so we check this first. _TypeMapEntry( - match=_match_issubclass(typing.FrozenSet), + match=_match_is_primitive(frozenset), arity=1, beam_type=typehints.FrozenSet), - _TypeMapEntry(match=match_is_set, arity=1, beam_type=typehints.Set), + _TypeMapEntry(match=_match_is_set, arity=1, beam_type=typehints.Set), # NamedTuple is a subclass of Tuple, but it needs special handling. # We just convert it to Any for now. # This MUST appear before the entry for the normal Tuple. _TypeMapEntry( match=match_is_named_tuple, arity=0, beam_type=typehints.Any), _TypeMapEntry( - match=_match_issubclass(typing.Tuple), - arity=-1, + match=_match_is_primitive(tuple), arity=-1, beam_type=typehints.Tuple), _TypeMapEntry(match=_match_is_union, arity=-1, beam_type=typehints.Union), _TypeMapEntry( - match=_match_issubclass(typing.Generator), + match=_match_issubclass(collections.abc.Generator), arity=3, beam_type=typehints.Generator), _TypeMapEntry( - match=_match_issubclass(typing.Iterator), + match=_match_issubclass(collections.abc.Iterator), arity=1, beam_type=typehints.Iterator), _TypeMapEntry( diff --git a/sdks/python/apache_beam/typehints/native_type_compatibility_test.py b/sdks/python/apache_beam/typehints/native_type_compatibility_test.py index ae8e1a0b2906..15b5da99fb0c 100644 --- a/sdks/python/apache_beam/typehints/native_type_compatibility_test.py +++ b/sdks/python/apache_beam/typehints/native_type_compatibility_test.py @@ -30,6 +30,7 @@ from apache_beam.typehints.native_type_compatibility import convert_to_beam_types from apache_beam.typehints.native_type_compatibility import convert_to_typing_type from apache_beam.typehints.native_type_compatibility import convert_to_typing_types +from apache_beam.typehints.native_type_compatibility import convert_typing_to_builtin from apache_beam.typehints.native_type_compatibility import is_any _TestNamedTuple = typing.NamedTuple( @@ -43,6 +44,7 @@ class _TestClass(object): T = typing.TypeVar('T') +U = typing.TypeVar('U') class _TestGeneric(typing.Generic[T]): @@ -140,7 +142,7 @@ def test_convert_to_beam_type_with_builtin_types(self): ( 'builtin nested tuple', tuple[str, list], - typehints.Tuple[str, typehints.List[typehints.Any]], + typehints.Tuple[str, typehints.List[typehints.TypeVariable('T')]], ) ] @@ -159,7 +161,7 @@ def test_convert_to_beam_type_with_collections_types(self): typehints.Iterable[int]), ( 'collection generator', - collections.abc.Generator[int], + collections.abc.Generator[int, None, None], typehints.Generator[int]), ( 'collection iterator', @@ -177,9 +179,8 @@ def test_convert_to_beam_type_with_collections_types(self): 'mapping not caught', collections.abc.Mapping[str, int], collections.abc.Mapping[str, int]), - ('set', collections.abc.Set[str], typehints.Set[str]), + ('set', collections.abc.Set[int], 
typehints.Set[int]), ('mutable set', collections.abc.MutableSet[int], typehints.Set[int]), - ('enum set', collections.abc.Set[_TestEnum], typehints.Set[_TestEnum]), ( 'enum mutable set', collections.abc.MutableSet[_TestEnum], @@ -337,6 +338,24 @@ def test_is_any(self): for expected, typ in test_cases: self.assertEqual(expected, is_any(typ), msg='%s' % typ) + def test_convert_typing_to_builtin(self): + test_cases = [ + ('list', typing.List[int], + list[int]), ('dict', typing.Dict[str, int], dict[str, int]), + ('tuple', typing.Tuple[str, int], tuple[str, int]), + ('set', typing.Set[str], set[str]), + ('frozenset', typing.FrozenSet[int], frozenset[int]), + ( + 'nested', + typing.List[typing.Dict[str, typing.Tuple[int]]], + list[dict[str, tuple[int]]]), ('typevar', typing.List[T], list[T]), + ('nested_typevar', typing.Dict[T, typing.List[U]], dict[T, list[U]]) + ] + + for description, typing_type, expected_builtin_type in test_cases: + builtin_type = convert_typing_to_builtin(typing_type) + self.assertEqual(builtin_type, expected_builtin_type, description) + if __name__ == '__main__': unittest.main() From b911ccafe66eee9eb1822d3b82d0b6d13ae0917c Mon Sep 17 00:00:00 2001 From: Chamikara Jayalath Date: Tue, 7 Jan 2025 12:36:49 -0800 Subject: [PATCH 31/43] Documents the connectors supported via the Managed API (#33516) * Documents the connectors supported via the Managed API * Corrects a row and adjusts title text --- .../content/en/documentation/io/connectors.md | 59 +++++++++++++++++++ 1 file changed, 59 insertions(+) diff --git a/website/www/site/content/en/documentation/io/connectors.md b/website/www/site/content/en/documentation/io/connectors.md index 313f72ce622a..aa7a7c0f4b51 100644 --- a/website/www/site/content/en/documentation/io/connectors.md +++ b/website/www/site/content/en/documentation/io/connectors.md @@ -37,6 +37,7 @@ This table provides a consolidated, at-a-glance overview of the available built- Typescript Batch Supported Streaming Supported + Supported via Managed API FileIO @@ -57,6 +58,7 @@ This table provides a consolidated, at-a-glance overview of the available built- Not available ✔ ✔ + ✘ AvroIO @@ -80,6 +82,7 @@ This table provides a consolidated, at-a-glance overview of the available built- ✔ ✔ + ✘ TextIO (metrics) @@ -103,6 +106,7 @@ This table provides a consolidated, at-a-glance overview of the available built- ✔ ✔ + ✘ TFRecordIO @@ -120,6 +124,7 @@ This table provides a consolidated, at-a-glance overview of the available built- Not available ✔ ✘ + ✘ XmlIO @@ -134,6 +139,7 @@ This table provides a consolidated, at-a-glance overview of the available built- Not available ✔ ✘ + ✘ TikaIO @@ -148,6 +154,7 @@ This table provides a consolidated, at-a-glance overview of the available built- Not available ✔ ✔ + ✘ ParquetIO (guide) @@ -171,6 +178,7 @@ This table provides a consolidated, at-a-glance overview of the available built- ✔ ✘ + ✘ ThriftIO @@ -185,6 +193,7 @@ This table provides a consolidated, at-a-glance overview of the available built- Not available ✔ ✘ + ✘ HadoopFileSystem @@ -204,6 +213,7 @@ This table provides a consolidated, at-a-glance overview of the available built- ✔ ✘ + ✘ GcsFileSystem (metrics) @@ -226,6 +236,7 @@ This table provides a consolidated, at-a-glance overview of the available built- ✔ ✘ + ✘ LocalFileSystem @@ -248,6 +259,7 @@ This table provides a consolidated, at-a-glance overview of the available built- ✔ ✘ + ✘ S3FileSystem @@ -267,6 +279,7 @@ This table provides a consolidated, at-a-glance overview of the available built- ✔ ✘ + ✘ In-memory @@ 
-281,6 +294,7 @@ This table provides a consolidated, at-a-glance overview of the available built- ✘ ✔ ✘ + ✘ KinesisIO @@ -298,6 +312,7 @@ This table provides a consolidated, at-a-glance overview of the available built- Not available ✔ ✔ + ✘ AmqpIO @@ -312,6 +327,7 @@ This table provides a consolidated, at-a-glance overview of the available built- Not available ✔ ✔ + ✘ KafkaIO @@ -335,6 +351,7 @@ This table provides a consolidated, at-a-glance overview of the available built- ✔ ✔ + ✔ PubSubIO @@ -358,6 +375,7 @@ This table provides a consolidated, at-a-glance overview of the available built- ✔ ✔ + ✘ JmsIO @@ -372,6 +390,7 @@ This table provides a consolidated, at-a-glance overview of the available built- Not available ✔ ✔ + ✘ MqttIO @@ -386,6 +405,7 @@ This table provides a consolidated, at-a-glance overview of the available built- Not available ✔ ✔ + ✘ RabbitMqIO @@ -400,6 +420,7 @@ This table provides a consolidated, at-a-glance overview of the available built- Not available ✔ ✔ + ✘ SqsIO @@ -414,6 +435,7 @@ This table provides a consolidated, at-a-glance overview of the available built- Not available ✔ ✔ + ✘ SnsIO @@ -428,6 +450,7 @@ This table provides a consolidated, at-a-glance overview of the available built- Not available ✔ ✘ + ✘ CassandraIO @@ -442,6 +465,7 @@ This table provides a consolidated, at-a-glance overview of the available built- Not available ✔ ✘ + ✘ HadoopFormatIO (guide) @@ -456,6 +480,7 @@ This table provides a consolidated, at-a-glance overview of the available built- Not available ✔ ✔ + ✘ HBaseIO @@ -470,6 +495,7 @@ This table provides a consolidated, at-a-glance overview of the available built- Not available ✔ ✘ + ✘ HCatalogIO (guide) @@ -484,6 +510,7 @@ This table provides a consolidated, at-a-glance overview of the available built- Not available ✔ ✔ + ✘ KuduIO @@ -498,6 +525,7 @@ This table provides a consolidated, at-a-glance overview of the available built- Not available ✔ ✘ + ✘ SolrIO @@ -512,6 +540,7 @@ This table provides a consolidated, at-a-glance overview of the available built- Not available ✔ ✔ + ✘ ElasticsearchIO @@ -526,6 +555,7 @@ This table provides a consolidated, at-a-glance overview of the available built- Not available ✔ ✔ + ✘ BigQueryIO (guide) (metrics) @@ -552,6 +582,7 @@ This table provides a consolidated, at-a-glance overview of the available built- ✔ ✔ + ✔ BigTableIO (metrics) @@ -578,6 +609,7 @@ This table provides a consolidated, at-a-glance overview of the available built- Not available ✔ ✔ + ✘ DatastoreIO @@ -598,6 +630,7 @@ This table provides a consolidated, at-a-glance overview of the available built- Not available ✔ ✔ + ✘ SnowflakeIO (guide) @@ -615,6 +648,7 @@ This table provides a consolidated, at-a-glance overview of the available built- Not available ✔ ✘ + ✘ SpannerIO @@ -635,6 +669,7 @@ This table provides a consolidated, at-a-glance overview of the available built- Not available ✔ ✔ + ✘ JdbcIO @@ -655,6 +690,7 @@ This table provides a consolidated, at-a-glance overview of the available built- Not available ✔ ✘ + ✘ DebeziumIO @@ -675,6 +711,7 @@ This table provides a consolidated, at-a-glance overview of the available built- Not available ✔ ✔ + ✘ MongoDbIO @@ -695,6 +732,7 @@ This table provides a consolidated, at-a-glance overview of the available built- Not available ✔ ✘ + ✘ MongoDbGridFSIO @@ -709,6 +747,7 @@ This table provides a consolidated, at-a-glance overview of the available built- Not available ✔ ✘ + ✘ RedisIO @@ -723,6 +762,7 @@ This table provides a consolidated, at-a-glance overview of the available built- Not 
available ✔ ✘ + ✘ DynamoDBIO @@ -737,6 +777,7 @@ This table provides a consolidated, at-a-glance overview of the available built- Not available ✔ ✔ + ✘ ClickHouseIO @@ -751,6 +792,7 @@ This table provides a consolidated, at-a-glance overview of the available built- Not available ✔ ✘ + ✘ DatabaseIO @@ -765,6 +807,7 @@ This table provides a consolidated, at-a-glance overview of the available built- Not available ✔ ✘ + ✘ GenerateSequence @@ -779,6 +822,7 @@ This table provides a consolidated, at-a-glance overview of the available built- Not available ✔ ✔ + ✘ SplunkIO @@ -793,6 +837,7 @@ This table provides a consolidated, at-a-glance overview of the available built- Not available ✔ ✔ + ✘ FhirIO @@ -810,6 +855,7 @@ This table provides a consolidated, at-a-glance overview of the available built- Not available ✔ ✔ + ✘ HL7v2IO @@ -824,6 +870,7 @@ This table provides a consolidated, at-a-glance overview of the available built- Not available ✔ ✔ + ✘ DicomIO @@ -841,6 +888,7 @@ This table provides a consolidated, at-a-glance overview of the available built- Not available ✔ ✔ + ✘ @@ -857,6 +905,7 @@ This table provides a consolidated, at-a-glance overview of the available built- Not available ✔ ✔ + ✘ Firestore IO @@ -871,6 +920,7 @@ This table provides a consolidated, at-a-glance overview of the available built- Not available ✔ ✘ + ✘ Neo4j @@ -885,6 +935,7 @@ This table provides a consolidated, at-a-glance overview of the available built- Not available ✔ ✘ + ✘ Pub/Sub Lite @@ -905,6 +956,7 @@ This table provides a consolidated, at-a-glance overview of the available built- ✔ ✔ + ✘ InfluxDB @@ -919,6 +971,7 @@ This table provides a consolidated, at-a-glance overview of the available built- Not available ✔ ✔ + ✘ SparkReceiverIO (guide) @@ -933,6 +986,7 @@ This table provides a consolidated, at-a-glance overview of the available built- Not available ✘ ✔ + ✘ CdapIO (guide) @@ -947,6 +1001,7 @@ This table provides a consolidated, at-a-glance overview of the available built- Not available ✔ ✔ + ✘ SingleStoreDB (guide) @@ -961,6 +1016,7 @@ This table provides a consolidated, at-a-glance overview of the available built- Not available ✔ ✘ + ✘ GoogleAdsIO @@ -975,6 +1031,7 @@ This table provides a consolidated, at-a-glance overview of the available built- Not available ✔ ✔ + ✘ @@ -988,6 +1045,7 @@ This table provides a consolidated, at-a-glance overview of the available built- Not available ✔ ✔ + ✘ @@ -1001,6 +1059,7 @@ This table provides a consolidated, at-a-glance overview of the available built- Not available ✔ ✔ + ✔ From ab354dbe0bea397b63058430f5e6218563a33711 Mon Sep 17 00:00:00 2001 From: liferoad Date: Tue, 7 Jan 2025 15:48:45 -0500 Subject: [PATCH 32/43] Update REVIEWERS.yml with the BigTable team (#33492) --- .github/REVIEWERS.yml | 3 +++ 1 file changed, 3 insertions(+) diff --git a/.github/REVIEWERS.yml b/.github/REVIEWERS.yml index dba969180c45..882a9f19ce94 100644 --- a/.github/REVIEWERS.yml +++ b/.github/REVIEWERS.yml @@ -56,6 +56,9 @@ labels: reviewers: - igorbernstein2 - mutianf + - djyau + - andre-sampaio + - meeral-k exclusionList: [] - name: healthcare reviewers: From fa086f99e24d320f69f7901b2e4aa55060f559ec Mon Sep 17 00:00:00 2001 From: Danny McCormick Date: Tue, 7 Jan 2025 15:58:07 -0500 Subject: [PATCH 33/43] Pin protobuf for older hadoop tests (#33525) * Pin protobuf for older hadoop tests * trigger postcommit --- .../trigger_files/beam_PostCommit_Java_Hadoop_Versions.json | 2 +- sdks/java/io/hadoop-file-system/build.gradle | 5 +++++ 2 files changed, 6 insertions(+), 1 deletion(-) diff 
--git a/.github/trigger_files/beam_PostCommit_Java_Hadoop_Versions.json b/.github/trigger_files/beam_PostCommit_Java_Hadoop_Versions.json index 8784d0786c02..53d94cfc4f1e 100644 --- a/.github/trigger_files/beam_PostCommit_Java_Hadoop_Versions.json +++ b/.github/trigger_files/beam_PostCommit_Java_Hadoop_Versions.json @@ -1,4 +1,4 @@ { "comment": "Modify this file in a trivial way to cause this test suite to run", - "modification": 2 + "modification": 3 } \ No newline at end of file diff --git a/sdks/java/io/hadoop-file-system/build.gradle b/sdks/java/io/hadoop-file-system/build.gradle index b4ebbfa08c5e..d78d6a300cac 100644 --- a/sdks/java/io/hadoop-file-system/build.gradle +++ b/sdks/java/io/hadoop-file-system/build.gradle @@ -72,6 +72,11 @@ hadoopVersions.each {kv -> force "org.apache.hadoop:hadoop-hdfs:$kv.value:tests" force "org.apache.hadoop:hadoop-hdfs-client:$kv.value:tests" force library.java.slf4j_jdk14 + if ("$kv.value" == "2.10.2" || "$kv.value" == "3.2.4") { + // Older Hadoop only works with older protobuf + force "com.google.protobuf:protobuf-java:3.25.5" + force "com.google.protobuf:protobuf-java-util:3.25.5" + } } } } From 6618968a2f7a0548915161259dfc2dd9bdb002b5 Mon Sep 17 00:00:00 2001 From: Dmitry Labutin Date: Tue, 7 Jan 2025 13:00:49 -0800 Subject: [PATCH 34/43] Return zero elements immediately if the requested number of quantiles is 1. (#33524) --- sdks/go/pkg/beam/transforms/stats/quantiles.go | 4 ++++ .../pkg/beam/transforms/stats/quantiles_test.go | 17 +++++++++++++++++ 2 files changed, 21 insertions(+) diff --git a/sdks/go/pkg/beam/transforms/stats/quantiles.go b/sdks/go/pkg/beam/transforms/stats/quantiles.go index 6d2baa8b5e99..93ca4e3ee525 100644 --- a/sdks/go/pkg/beam/transforms/stats/quantiles.go +++ b/sdks/go/pkg/beam/transforms/stats/quantiles.go @@ -701,6 +701,10 @@ func reduce(s beam.Scope, weightedElements beam.PCollection, state approximateQu // For example, if numQuantiles = 2, the returned list would contain a single element such that approximately half of the input would be less than that element and half would be greater or equal. func ApproximateWeightedQuantiles(s beam.Scope, pc beam.PCollection, less any, opts Opts) beam.PCollection { _, t := beam.ValidateKVType(pc) + // Return zero elements immediately if the requested number of quantiles is 1. + if opts.NumQuantiles == 1 { + return beam.Create(s, reflect.New(reflect.SliceOf(t.Type())).Elem().Interface()) + } state := approximateQuantilesCombineFnState{ K: opts.K, NumQuantiles: opts.NumQuantiles, diff --git a/sdks/go/pkg/beam/transforms/stats/quantiles_test.go b/sdks/go/pkg/beam/transforms/stats/quantiles_test.go index 1e389eed128b..bad5fce30625 100644 --- a/sdks/go/pkg/beam/transforms/stats/quantiles_test.go +++ b/sdks/go/pkg/beam/transforms/stats/quantiles_test.go @@ -246,3 +246,20 @@ func TestWeightedElementEncoding(t *testing.T) { t.Errorf("Invalid coder. 
Wanted %v got %v", w, decoded) } } + +func TestZeroQuantiles(t *testing.T) { + const numElements int = 30000 + inputSlice := make([]int, 0, numElements) + for i := 0; i < numElements; i++ { + inputSlice = append(inputSlice, i) + } + p, s, input, expected := ptest.CreateList2(inputSlice, [][]int{{}}) + quantiles := ApproximateQuantiles(s, input, less, Opts{ + K: 200, + NumQuantiles: 1, + }) + passert.Equals(s, quantiles, expected) + if err := ptest.Run(p); err != nil { + t.Errorf("ApproximateQuantiles failed: %v", err) + } +} From 6509e5140339b75105d150d09a19ecfb09255ff3 Mon Sep 17 00:00:00 2001 From: Robert Burke Date: Tue, 7 Jan 2025 13:01:56 -0800 Subject: [PATCH 35/43] [#33513][prism]Handle Time sorted requirement and drop late data. (#33515) --- runners/prism/java/build.gradle | 16 +++++++++------- .../prism/internal/engine/elementmanager.go | 12 ++++++++++++ .../beam/runners/prism/internal/handlepardo.go | 7 +++++-- .../runners/prism/internal/jobservices/job.go | 1 + .../apache/beam/sdk/transforms/ParDoTest.java | 8 ++++++-- 5 files changed, 33 insertions(+), 11 deletions(-) diff --git a/runners/prism/java/build.gradle b/runners/prism/java/build.gradle index a48973f65674..791952c625f4 100644 --- a/runners/prism/java/build.gradle +++ b/runners/prism/java/build.gradle @@ -98,6 +98,7 @@ def sickbayTests = [ 'org.apache.beam.sdk.transforms.ViewTest.testTriggeredLatestSingleton', // Requires Allowed Lateness, among others. 'org.apache.beam.sdk.transforms.ParDoTest$TimerTests.testEventTimeTimerSetWithinAllowedLateness', + 'org.apache.beam.sdk.transforms.ParDoTest$StateTests.testRequiresTimeSortedInputWithLateDataAndAllowedLateness', 'org.apache.beam.sdk.testing.TestStreamTest.testFirstElementLate', 'org.apache.beam.sdk.testing.TestStreamTest.testDiscardingMode', 'org.apache.beam.sdk.testing.TestStreamTest.testEarlyPanesOfWindow', @@ -160,6 +161,14 @@ def sickbayTests = [ // TODO(https://github.com/apache/beam/issues/31231) 'org.apache.beam.sdk.transforms.RedistributeTest.testRedistributePreservesMetadata', + + // These tests fail once Late Data was being precisely dropped. + // They set a single element to be late data, and expect it (correctly) to be preserved. + // Since presently, these are treated as No-ops, the fix is to disable the + // dropping behavior when a stage's input is a Reshuffle/Redistribute transform. + 'org.apache.beam.sdk.transforms.ReshuffleTest.testReshuffleWithTimestampsStreaming', + 'org.apache.beam.sdk.transforms.RedistributeTest.testRedistributeWithTimestampsStreaming', + // Prism isn't handling Java's side input views properly. // https://github.com/apache/beam/issues/32932 // java.lang.IllegalArgumentException: PCollection with more than one element accessed as a singleton view. @@ -177,13 +186,6 @@ def sickbayTests = [ // java.lang.IllegalStateException: java.io.EOFException 'org.apache.beam.sdk.transforms.ViewTest.testSideInputWithNestedIterables', - // Requires Time Sorted Input - 'org.apache.beam.sdk.transforms.ParDoTest$StateTests.testRequiresTimeSortedInput', - 'org.apache.beam.sdk.transforms.ParDoTest$StateTests.testRequiresTimeSortedInputWithTestStream', - 'org.apache.beam.sdk.transforms.ParDoTest$StateTests.testRequiresTimeSortedInputWithLateDataAndAllowedLateness', - 'org.apache.beam.sdk.transforms.ParDoTest$StateTests.testTwoRequiresTimeSortedInputWithLateData', - 'org.apache.beam.sdk.transforms.ParDoTest$StateTests.testRequiresTimeSortedInputWithLateData', - // Missing output due to processing time timer skew. 
'org.apache.beam.sdk.transforms.ParDoTest$TimestampTests.testProcessElementSkew', diff --git a/sdks/go/pkg/beam/runners/prism/internal/engine/elementmanager.go b/sdks/go/pkg/beam/runners/prism/internal/engine/elementmanager.go index 3cfcf9ef8c0e..bb3c8ceceeb8 100644 --- a/sdks/go/pkg/beam/runners/prism/internal/engine/elementmanager.go +++ b/sdks/go/pkg/beam/runners/prism/internal/engine/elementmanager.go @@ -1179,6 +1179,18 @@ func makeStageState(ID string, inputIDs, outputIDs []string, sides []LinkID) *st func (ss *stageState) AddPending(newPending []element) int { ss.mu.Lock() defer ss.mu.Unlock() + // TODO(#https://github.com/apache/beam/issues/31438): + // Adjust with AllowedLateness + // Data that arrives after the *output* watermark is late. + threshold := ss.output + origPending := make([]element, 0, ss.pending.Len()) + for _, e := range newPending { + if e.window.MaxTimestamp() < threshold { + continue + } + origPending = append(origPending, e) + } + newPending = origPending if ss.stateful { if ss.pendingByKeys == nil { ss.pendingByKeys = map[string]*dataAndTimers{} diff --git a/sdks/go/pkg/beam/runners/prism/internal/handlepardo.go b/sdks/go/pkg/beam/runners/prism/internal/handlepardo.go index 13e9b6f1b79d..7ac472251f61 100644 --- a/sdks/go/pkg/beam/runners/prism/internal/handlepardo.go +++ b/sdks/go/pkg/beam/runners/prism/internal/handlepardo.go @@ -84,9 +84,12 @@ func (h *pardo) PrepareTransform(tid string, t *pipepb.PTransform, comps *pipepb // At their simplest, we don't need to do anything special at pre-processing time, and simply pass through as normal. - // StatefulDoFns need to be marked as being roots. + // ForceRoots cause fusion breaks in the optimized graph. + // StatefulDoFns need to be marked as being roots, for correct per-key state handling. + // Prism already sorts input elements for a stage by EventTime, so a fusion break enables the sorted behavior. var forcedRoots []string - if len(pdo.StateSpecs)+len(pdo.TimerFamilySpecs) > 0 { + if len(pdo.GetStateSpecs())+len(pdo.GetTimerFamilySpecs()) > 0 || + pdo.GetRequiresTimeSortedInput() { forcedRoots = append(forcedRoots, tid) } diff --git a/sdks/go/pkg/beam/runners/prism/internal/jobservices/job.go b/sdks/go/pkg/beam/runners/prism/internal/jobservices/job.go index 4be64e5a9c80..f186b11fd1d8 100644 --- a/sdks/go/pkg/beam/runners/prism/internal/jobservices/job.go +++ b/sdks/go/pkg/beam/runners/prism/internal/jobservices/job.go @@ -47,6 +47,7 @@ var supportedRequirements = map[string]struct{}{ urns.RequirementStatefulProcessing: {}, urns.RequirementBundleFinalization: {}, urns.RequirementOnWindowExpiration: {}, + urns.RequirementTimeSortedInput: {}, } // TODO, move back to main package, and key off of executor handlers? 
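Note, for illustration only and not part of this patch: a pipeline opts into the behavior exercised by the Java tests below through the SDK's @RequiresTimeSortedInput annotation on a stateful DoFn's @ProcessElement method, which Prism now honors by forcing a fusion break so the stage's input arrives sorted by event time. A minimal sketch follows; the class name, key/value types, and running-sum logic are hypothetical, while the annotations and state API are the existing Beam Java SDK ones.

import org.apache.beam.sdk.coders.VarLongCoder;
import org.apache.beam.sdk.state.StateSpec;
import org.apache.beam.sdk.state.StateSpecs;
import org.apache.beam.sdk.state.ValueState;
import org.apache.beam.sdk.transforms.DoFn;
import org.apache.beam.sdk.values.KV;

// Emits a per-key running sum; @RequiresTimeSortedInput asks the runner to deliver
// elements to processElement in event-time order.
class RunningSumFn extends DoFn<KV<String, Long>, Long> {
  @StateId("sum")
  private final StateSpec<ValueState<Long>> sumSpec = StateSpecs.value(VarLongCoder.of());

  @RequiresTimeSortedInput
  @ProcessElement
  public void processElement(
      @Element KV<String, Long> element,
      @StateId("sum") ValueState<Long> sum,
      OutputReceiver<Long> out) {
    long total = (sum.read() == null ? 0L : sum.read()) + element.getValue();
    sum.write(total);
    out.output(total);
  }
}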
diff --git a/sdks/java/core/src/test/java/org/apache/beam/sdk/transforms/ParDoTest.java b/sdks/java/core/src/test/java/org/apache/beam/sdk/transforms/ParDoTest.java index 742547b9b6c3..8409133772eb 100644 --- a/sdks/java/core/src/test/java/org/apache/beam/sdk/transforms/ParDoTest.java +++ b/sdks/java/core/src/test/java/org/apache/beam/sdk/transforms/ParDoTest.java @@ -3764,7 +3764,9 @@ public void testRequiresTimeSortedInputWithLateData() { if (stamp == 100) { // advance watermark when we have 100 remaining elements // all the rest are going to be late elements - input = input.advanceWatermarkTo(Instant.ofEpochMilli(stamp)); + input = + input.advanceWatermarkTo( + GlobalWindow.INSTANCE.maxTimestamp().plus(Duration.standardSeconds(1))); } } testTimeSortedInput( @@ -3796,7 +3798,9 @@ public void testTwoRequiresTimeSortedInputWithLateData() { if (stamp == 100) { // advance watermark when we have 100 remaining elements // all the rest are going to be late elements - input = input.advanceWatermarkTo(Instant.ofEpochMilli(stamp)); + input = + input.advanceWatermarkTo( + GlobalWindow.INSTANCE.maxTimestamp().plus(Duration.standardSeconds(1))); } } // apply the sorted function for the first time From 40151abbfdd3b1deb6a7b3de395562de91b818a6 Mon Sep 17 00:00:00 2001 From: Filipe Regadas Date: Tue, 7 Jan 2025 16:31:36 -0500 Subject: [PATCH 36/43] Add Iceberg support for name-based mapping schema (#33315) * Add Iceberg support for name-based mapping schema * Add nullable annotation * Add nested field * iceberg-gcp already as a runtimeOnly * Trigger IT tests --- .../IO_Iceberg_Integration_Tests.json | 2 +- sdks/java/io/iceberg/build.gradle | 2 + .../beam/sdk/io/iceberg/ScanTaskReader.java | 37 +++- .../sdk/io/iceberg/IcebergIOReadTest.java | 202 ++++++++++++++++++ .../sdk/io/iceberg/TestDataWarehouse.java | 4 + .../beam/sdk/io/iceberg/TestFixtures.java | 7 + 6 files changed, 244 insertions(+), 10 deletions(-) diff --git a/.github/trigger_files/IO_Iceberg_Integration_Tests.json b/.github/trigger_files/IO_Iceberg_Integration_Tests.json index bbdc3a3910ef..62ae7886c573 100644 --- a/.github/trigger_files/IO_Iceberg_Integration_Tests.json +++ b/.github/trigger_files/IO_Iceberg_Integration_Tests.json @@ -1,4 +1,4 @@ { "comment": "Modify this file in a trivial way to cause this test suite to run", - "modification": 3 + "modification": 4 } diff --git a/sdks/java/io/iceberg/build.gradle b/sdks/java/io/iceberg/build.gradle index f7a9e5c8533d..19fcbb7d1ea0 100644 --- a/sdks/java/io/iceberg/build.gradle +++ b/sdks/java/io/iceberg/build.gradle @@ -65,6 +65,8 @@ dependencies { testImplementation library.java.bigdataoss_gcsio testImplementation library.java.bigdataoss_gcs_connector testImplementation library.java.bigdataoss_util_hadoop + testImplementation "org.apache.parquet:parquet-avro:$parquet_version" + testImplementation "org.apache.parquet:parquet-common:$parquet_version" testImplementation "org.apache.iceberg:iceberg-data:$iceberg_version" testImplementation project(path: ":sdks:java:core", configuration: "shadowTest") testImplementation project(":sdks:java:extensions:google-cloud-platform-core") diff --git a/sdks/java/io/iceberg/src/main/java/org/apache/beam/sdk/io/iceberg/ScanTaskReader.java b/sdks/java/io/iceberg/src/main/java/org/apache/beam/sdk/io/iceberg/ScanTaskReader.java index 5784dfd79744..c9906618a64d 100644 --- a/sdks/java/io/iceberg/src/main/java/org/apache/beam/sdk/io/iceberg/ScanTaskReader.java +++ b/sdks/java/io/iceberg/src/main/java/org/apache/beam/sdk/io/iceberg/ScanTaskReader.java @@ -35,6 
+35,7 @@ import org.apache.iceberg.PartitionSpec; import org.apache.iceberg.Schema; import org.apache.iceberg.Table; +import org.apache.iceberg.TableProperties; import org.apache.iceberg.avro.Avro; import org.apache.iceberg.data.IdentityPartitionConverters; import org.apache.iceberg.data.Record; @@ -47,6 +48,7 @@ import org.apache.iceberg.io.CloseableIterator; import org.apache.iceberg.io.FileIO; import org.apache.iceberg.io.InputFile; +import org.apache.iceberg.mapping.NameMappingParser; import org.apache.iceberg.orc.ORC; import org.apache.iceberg.parquet.Parquet; import org.apache.iceberg.types.Type; @@ -98,6 +100,8 @@ public boolean advance() throws IOException { // which are not null-safe. @SuppressWarnings("nullness") org.apache.iceberg.@NonNull Schema project = this.project; + @Nullable + String nameMapping = source.getTable().properties().get(TableProperties.DEFAULT_NAME_MAPPING); do { // If our current iterator is working... do that. @@ -129,37 +133,52 @@ public boolean advance() throws IOException { switch (file.format()) { case ORC: LOG.info("Preparing ORC input"); - iterable = + ORC.ReadBuilder orcReader = ORC.read(input) .split(fileTask.start(), fileTask.length()) .project(project) .createReaderFunc( fileSchema -> GenericOrcReader.buildReader(project, fileSchema, idToConstants)) - .filter(fileTask.residual()) - .build(); + .filter(fileTask.residual()); + + if (nameMapping != null) { + orcReader.withNameMapping(NameMappingParser.fromJson(nameMapping)); + } + + iterable = orcReader.build(); break; case PARQUET: LOG.info("Preparing Parquet input."); - iterable = + Parquet.ReadBuilder parquetReader = Parquet.read(input) .split(fileTask.start(), fileTask.length()) .project(project) .createReaderFunc( fileSchema -> GenericParquetReaders.buildReader(project, fileSchema, idToConstants)) - .filter(fileTask.residual()) - .build(); + .filter(fileTask.residual()); + + if (nameMapping != null) { + parquetReader.withNameMapping(NameMappingParser.fromJson(nameMapping)); + } + + iterable = parquetReader.build(); break; case AVRO: LOG.info("Preparing Avro input."); - iterable = + Avro.ReadBuilder avroReader = Avro.read(input) .split(fileTask.start(), fileTask.length()) .project(project) .createReaderFunc( - fileSchema -> DataReader.create(project, fileSchema, idToConstants)) - .build(); + fileSchema -> DataReader.create(project, fileSchema, idToConstants)); + + if (nameMapping != null) { + avroReader.withNameMapping(NameMappingParser.fromJson(nameMapping)); + } + + iterable = avroReader.build(); break; default: throw new UnsupportedOperationException("Cannot read format: " + file.format()); diff --git a/sdks/java/io/iceberg/src/test/java/org/apache/beam/sdk/io/iceberg/IcebergIOReadTest.java b/sdks/java/io/iceberg/src/test/java/org/apache/beam/sdk/io/iceberg/IcebergIOReadTest.java index 6ff3bdf6a4ff..1c3f9b53f31a 100644 --- a/sdks/java/io/iceberg/src/test/java/org/apache/beam/sdk/io/iceberg/IcebergIOReadTest.java +++ b/sdks/java/io/iceberg/src/test/java/org/apache/beam/sdk/io/iceberg/IcebergIOReadTest.java @@ -21,6 +21,8 @@ import static org.hamcrest.MatcherAssert.assertThat; import static org.hamcrest.Matchers.containsInAnyOrder; +import java.io.File; +import java.io.IOException; import java.util.Arrays; import java.util.Collection; import java.util.List; @@ -28,6 +30,8 @@ import java.util.UUID; import java.util.stream.Collectors; import java.util.stream.Stream; +import org.apache.avro.generic.GenericData; +import org.apache.avro.generic.GenericRecord; import 
org.apache.beam.sdk.coders.RowCoder; import org.apache.beam.sdk.schemas.Schema; import org.apache.beam.sdk.testing.PAssert; @@ -36,13 +40,31 @@ import org.apache.beam.sdk.transforms.ParDo; import org.apache.beam.sdk.values.PCollection; import org.apache.beam.sdk.values.Row; +import org.apache.beam.vendor.guava.v32_1_2_jre.com.google.common.collect.ImmutableList; import org.apache.beam.vendor.guava.v32_1_2_jre.com.google.common.collect.ImmutableMap; +import org.apache.hadoop.conf.Configuration; +import org.apache.hadoop.fs.Path; import org.apache.iceberg.CatalogUtil; +import org.apache.iceberg.DataFile; +import org.apache.iceberg.DataFiles; +import org.apache.iceberg.FileFormat; +import org.apache.iceberg.Metrics; +import org.apache.iceberg.MetricsConfig; import org.apache.iceberg.PartitionKey; import org.apache.iceberg.PartitionSpec; import org.apache.iceberg.Table; +import org.apache.iceberg.TableProperties; import org.apache.iceberg.catalog.TableIdentifier; +import org.apache.iceberg.data.Record; +import org.apache.iceberg.hadoop.HadoopInputFile; +import org.apache.iceberg.mapping.MappedField; +import org.apache.iceberg.mapping.MappedFields; +import org.apache.iceberg.mapping.NameMapping; +import org.apache.iceberg.mapping.NameMappingParser; +import org.apache.iceberg.parquet.ParquetUtil; import org.apache.iceberg.types.Types; +import org.apache.parquet.avro.AvroParquetWriter; +import org.apache.parquet.hadoop.ParquetWriter; import org.junit.ClassRule; import org.junit.Rule; import org.junit.Test; @@ -210,4 +232,184 @@ public void testIdentityColumnScan() throws Exception { testPipeline.run(); } + + @Test + public void testNameMappingScan() throws Exception { + org.apache.avro.Schema metadataSchema = + org.apache.avro.Schema.createRecord( + "metadata", + null, + null, + false, + ImmutableList.of( + new org.apache.avro.Schema.Field( + "source", + org.apache.avro.Schema.create(org.apache.avro.Schema.Type.STRING), + null, + null))); + + org.apache.avro.Schema avroSchema = + org.apache.avro.Schema.createRecord( + "test", + null, + null, + false, + ImmutableList.of( + new org.apache.avro.Schema.Field( + "data", + org.apache.avro.Schema.create(org.apache.avro.Schema.Type.STRING), + null, + null), + new org.apache.avro.Schema.Field( + "id", + org.apache.avro.Schema.create(org.apache.avro.Schema.Type.LONG), + null, + null), + new org.apache.avro.Schema.Field("metadata", metadataSchema, null, null))); + + List> recordData = + ImmutableList.>builder() + .add( + ImmutableMap.of( + "id", + 0L, + "data", + "clarification", + "metadata", + ImmutableMap.of("source", "systemA"))) + .add( + ImmutableMap.of( + "id", 1L, "data", "risky", "metadata", ImmutableMap.of("source", "systemB"))) + .add( + ImmutableMap.of( + "id", 2L, "data", "falafel", "metadata", ImmutableMap.of("source", "systemC"))) + .build(); + + List avroRecords = + recordData.stream() + .map(data -> avroGenericRecord(avroSchema, data)) + .collect(Collectors.toList()); + + Configuration hadoopConf = new Configuration(); + String path = createParquetFile(avroSchema, avroRecords); + HadoopInputFile inputFile = HadoopInputFile.fromLocation(path, hadoopConf); + + NameMapping defaultMapping = + NameMapping.of( + MappedField.of(1, "id"), + MappedField.of(2, "data"), + MappedField.of(3, "metadata", MappedFields.of(MappedField.of(4, "source")))); + ImmutableMap tableProperties = + ImmutableMap.builder() + .put(TableProperties.DEFAULT_NAME_MAPPING, NameMappingParser.toJson(defaultMapping)) + .build(); + + TableIdentifier tableId = + 
TableIdentifier.of("default", "table" + Long.toString(UUID.randomUUID().hashCode(), 16)); + Table simpleTable = + warehouse + .buildTable(tableId, TestFixtures.NESTED_SCHEMA) + .withProperties(tableProperties) + .withPartitionSpec(PartitionSpec.unpartitioned()) + .create(); + + MetricsConfig metricsConfig = MetricsConfig.forTable(simpleTable); + Metrics metrics = ParquetUtil.fileMetrics(inputFile, metricsConfig); + DataFile dataFile = + DataFiles.builder(PartitionSpec.unpartitioned()) + .withFormat(FileFormat.PARQUET) + .withInputFile(inputFile) + .withMetrics(metrics) + .build(); + + final Schema beamSchema = IcebergUtils.icebergSchemaToBeamSchema(TestFixtures.NESTED_SCHEMA); + + simpleTable.newFastAppend().appendFile(dataFile).commit(); + + Map catalogProps = + ImmutableMap.builder() + .put("type", CatalogUtil.ICEBERG_CATALOG_TYPE_HADOOP) + .put("warehouse", warehouse.location) + .build(); + + IcebergCatalogConfig catalogConfig = + IcebergCatalogConfig.builder() + .setCatalogName("name") + .setCatalogProperties(catalogProps) + .build(); + + PCollection output = + testPipeline + .apply(IcebergIO.readRows(catalogConfig).from(tableId)) + .apply(ParDo.of(new PrintRow())) + .setCoder(RowCoder.of(beamSchema)); + + final Row[] expectedRows = + recordData.stream() + .map(data -> icebergGenericRecord(TestFixtures.NESTED_SCHEMA.asStruct(), data)) + .map(record -> IcebergUtils.icebergRecordToBeamRow(beamSchema, record)) + .toArray(Row[]::new); + + PAssert.that(output) + .satisfies( + (Iterable rows) -> { + assertThat(rows, containsInAnyOrder(expectedRows)); + return null; + }); + + testPipeline.run(); + } + + @SuppressWarnings("unchecked") + public static GenericRecord avroGenericRecord( + org.apache.avro.Schema schema, Map values) { + GenericRecord record = new GenericData.Record(schema); + for (org.apache.avro.Schema.Field field : schema.getFields()) { + Object rawValue = values.get(field.name()); + Object avroValue = + rawValue instanceof Map + ? avroGenericRecord(field.schema(), (Map) rawValue) + : rawValue; + record.put(field.name(), avroValue); + } + return record; + } + + @SuppressWarnings("unchecked") + public static Record icebergGenericRecord(Types.StructType type, Map values) { + org.apache.iceberg.data.GenericRecord record = + org.apache.iceberg.data.GenericRecord.create(type); + for (Types.NestedField field : type.fields()) { + Object rawValue = values.get(field.name()); + Object value = + rawValue instanceof Map + ? 
icebergGenericRecord(field.type().asStructType(), (Map) rawValue) + : rawValue; + record.setField(field.name(), value); + } + return record; + } + + public static String createParquetFile(org.apache.avro.Schema schema, List records) + throws IOException { + + File tempFile = createTempFile(); + Path file = new Path(tempFile.getPath()); + + AvroParquetWriter.Builder builder = AvroParquetWriter.builder(file); + ParquetWriter parquetWriter = builder.withSchema(schema).build(); + for (GenericRecord record : records) { + parquetWriter.write(record); + } + parquetWriter.close(); + + return tempFile.getPath(); + } + + private static File createTempFile() throws IOException { + File tempFile = File.createTempFile(ScanSourceTest.class.getSimpleName(), ".tmp"); + tempFile.deleteOnExit(); + boolean unused = tempFile.delete(); + return tempFile; + } } diff --git a/sdks/java/io/iceberg/src/test/java/org/apache/beam/sdk/io/iceberg/TestDataWarehouse.java b/sdks/java/io/iceberg/src/test/java/org/apache/beam/sdk/io/iceberg/TestDataWarehouse.java index 9352123b5c77..d7a6cd34838c 100644 --- a/sdks/java/io/iceberg/src/test/java/org/apache/beam/sdk/io/iceberg/TestDataWarehouse.java +++ b/sdks/java/io/iceberg/src/test/java/org/apache/beam/sdk/io/iceberg/TestDataWarehouse.java @@ -163,6 +163,10 @@ public Table createTable( return catalog.createTable(tableId, schema, partitionSpec); } + public Catalog.TableBuilder buildTable(TableIdentifier tableId, Schema schema) { + return catalog.buildTable(tableId, schema); + } + public Table loadTable(TableIdentifier tableId) { return catalog.loadTable(tableId); } diff --git a/sdks/java/io/iceberg/src/test/java/org/apache/beam/sdk/io/iceberg/TestFixtures.java b/sdks/java/io/iceberg/src/test/java/org/apache/beam/sdk/io/iceberg/TestFixtures.java index a2ca86d1b5a2..a3ab3c8b50d4 100644 --- a/sdks/java/io/iceberg/src/test/java/org/apache/beam/sdk/io/iceberg/TestFixtures.java +++ b/sdks/java/io/iceberg/src/test/java/org/apache/beam/sdk/io/iceberg/TestFixtures.java @@ -36,6 +36,13 @@ public class TestFixtures { new Schema( required(1, "id", Types.LongType.get()), optional(2, "data", Types.StringType.get())); + public static final Schema NESTED_SCHEMA = + new Schema( + required(1, "id", Types.LongType.get()), + optional(2, "data", Types.StringType.get()), + optional( + 3, "metadata", Types.StructType.of(optional(4, "source", Types.StringType.get())))); + public static final List> FILE1SNAPSHOT1_DATA = ImmutableList.of( ImmutableMap.of("id", 0L, "data", "clarification"), From b4c3a4ff927c052d503378aeb1584c82fe457c47 Mon Sep 17 00:00:00 2001 From: Ahmed Abualsaud <65791736+ahmedabu98@users.noreply.github.com> Date: Wed, 8 Jan 2025 16:43:20 +0000 Subject: [PATCH 37/43] [BigQueryIO] fetch updated schema for newly created Storage API stream writers (#33231) * add dynamic dest test * fix and add some tests * add to changes.md * fix whitespace * trigger postcommits * address comments --- .../beam_PostCommit_Java_DataflowV2.json | 2 +- CHANGES.md | 1 + .../sdk/io/gcp/bigquery/BigQueryServices.java | 2 +- .../io/gcp/bigquery/BigQueryServicesImpl.java | 13 +- .../StorageApiWriteUnshardedRecords.java | 20 +- .../StorageApiWritesShardedRecords.java | 22 ++ .../io/gcp/testing/FakeDatasetService.java | 4 +- .../StorageApiSinkSchemaUpdateIT.java | 206 ++++++++++++++++-- 8 files changed, 236 insertions(+), 34 deletions(-) diff --git a/.github/trigger_files/beam_PostCommit_Java_DataflowV2.json b/.github/trigger_files/beam_PostCommit_Java_DataflowV2.json index 3f63c0c9975f..bbdc3a3910ef 100644 --- 
a/.github/trigger_files/beam_PostCommit_Java_DataflowV2.json +++ b/.github/trigger_files/beam_PostCommit_Java_DataflowV2.json @@ -1,4 +1,4 @@ { "comment": "Modify this file in a trivial way to cause this test suite to run", - "modification": 2 + "modification": 3 } diff --git a/CHANGES.md b/CHANGES.md index d5cbb76fb3d5..ac3992a872de 100644 --- a/CHANGES.md +++ b/CHANGES.md @@ -79,6 +79,7 @@ ## Bugfixes * Fixed X (Java/Python) ([#X](https://github.com/apache/beam/issues/X)). +* [BigQueryIO] Fixed an issue where Storage Write API sometimes doesn't pick up auto-schema updates ([#33231](https://github.com/apache/beam/pull/33231)) ## Security Fixes * Fixed (CVE-YYYY-NNNN)[https://www.cve.org/CVERecord?id=CVE-YYYY-NNNN] (Java/Python/Go) ([#X](https://github.com/apache/beam/issues/X)). diff --git a/sdks/java/io/google-cloud-platform/src/main/java/org/apache/beam/sdk/io/gcp/bigquery/BigQueryServices.java b/sdks/java/io/google-cloud-platform/src/main/java/org/apache/beam/sdk/io/gcp/bigquery/BigQueryServices.java index 374288de6562..8dbc47359b70 100644 --- a/sdks/java/io/google-cloud-platform/src/main/java/org/apache/beam/sdk/io/gcp/bigquery/BigQueryServices.java +++ b/sdks/java/io/google-cloud-platform/src/main/java/org/apache/beam/sdk/io/gcp/bigquery/BigQueryServices.java @@ -213,7 +213,7 @@ WriteStream createWriteStream(String tableUrn, WriteStream.Type type) throws IOException, InterruptedException; @Nullable - WriteStream getWriteStream(String writeStream); + TableSchema getWriteStreamSchema(String writeStream); /** * Create an append client for a given Storage API write stream. The stream must be created diff --git a/sdks/java/io/google-cloud-platform/src/main/java/org/apache/beam/sdk/io/gcp/bigquery/BigQueryServicesImpl.java b/sdks/java/io/google-cloud-platform/src/main/java/org/apache/beam/sdk/io/gcp/bigquery/BigQueryServicesImpl.java index 432f31e81c90..0688694fb9be 100644 --- a/sdks/java/io/google-cloud-platform/src/main/java/org/apache/beam/sdk/io/gcp/bigquery/BigQueryServicesImpl.java +++ b/sdks/java/io/google-cloud-platform/src/main/java/org/apache/beam/sdk/io/gcp/bigquery/BigQueryServicesImpl.java @@ -76,6 +76,7 @@ import com.google.cloud.bigquery.storage.v1.FinalizeWriteStreamResponse; import com.google.cloud.bigquery.storage.v1.FlushRowsRequest; import com.google.cloud.bigquery.storage.v1.FlushRowsResponse; +import com.google.cloud.bigquery.storage.v1.GetWriteStreamRequest; import com.google.cloud.bigquery.storage.v1.ProtoRows; import com.google.cloud.bigquery.storage.v1.ProtoSchema; import com.google.cloud.bigquery.storage.v1.ReadRowsRequest; @@ -86,6 +87,7 @@ import com.google.cloud.bigquery.storage.v1.StreamWriter; import com.google.cloud.bigquery.storage.v1.TableSchema; import com.google.cloud.bigquery.storage.v1.WriteStream; +import com.google.cloud.bigquery.storage.v1.WriteStreamView; import com.google.cloud.hadoop.util.ApiErrorExtractor; import com.google.cloud.hadoop.util.ChainingHttpRequestInitializer; import com.google.protobuf.DescriptorProtos; @@ -1418,8 +1420,15 @@ public WriteStream createWriteStream(String tableUrn, WriteStream.Type type) } @Override - public @Nullable WriteStream getWriteStream(String writeStream) { - return newWriteClient.getWriteStream(writeStream); + public @Nullable TableSchema getWriteStreamSchema(String writeStream) { + @Nullable + WriteStream stream = + newWriteClient.getWriteStream( + GetWriteStreamRequest.newBuilder() + .setView(WriteStreamView.FULL) + .setName(writeStream) + .build()); + return (stream != null && stream.hasTableSchema()) ? 
stream.getTableSchema() : null; } @Override diff --git a/sdks/java/io/google-cloud-platform/src/main/java/org/apache/beam/sdk/io/gcp/bigquery/StorageApiWriteUnshardedRecords.java b/sdks/java/io/google-cloud-platform/src/main/java/org/apache/beam/sdk/io/gcp/bigquery/StorageApiWriteUnshardedRecords.java index 30a9ecd274dc..1b5f32f7e431 100644 --- a/sdks/java/io/google-cloud-platform/src/main/java/org/apache/beam/sdk/io/gcp/bigquery/StorageApiWriteUnshardedRecords.java +++ b/sdks/java/io/google-cloud-platform/src/main/java/org/apache/beam/sdk/io/gcp/bigquery/StorageApiWriteUnshardedRecords.java @@ -29,7 +29,6 @@ import com.google.cloud.bigquery.storage.v1.Exceptions; import com.google.cloud.bigquery.storage.v1.ProtoRows; import com.google.cloud.bigquery.storage.v1.TableSchema; -import com.google.cloud.bigquery.storage.v1.WriteStream; import com.google.cloud.bigquery.storage.v1.WriteStream.Type; import com.google.protobuf.ByteString; import com.google.protobuf.DescriptorProtos; @@ -475,15 +474,18 @@ SchemaAndDescriptor getCurrentTableSchema(String stream, @Nullable TableSchema u () -> { if (autoUpdateSchema) { @Nullable - WriteStream writeStream = + TableSchema streamSchema = Preconditions.checkStateNotNull(maybeWriteStreamService) - .getWriteStream(streamName); - if (writeStream != null && writeStream.hasTableSchema()) { - TableSchema updatedFromStream = writeStream.getTableSchema(); - currentSchema.set(updatedFromStream); - updated.set(true); - LOG.debug( - "Fetched updated schema for table {}:\n\t{}", tableUrn, updatedFromStream); + .getWriteStreamSchema(streamName); + if (streamSchema != null) { + Optional newSchema = + TableSchemaUpdateUtils.getUpdatedSchema(initialTableSchema, streamSchema); + if (newSchema.isPresent()) { + currentSchema.set(newSchema.get()); + updated.set(true); + LOG.debug( + "Fetched updated schema for table {}:\n\t{}", tableUrn, newSchema.get()); + } } } return null; diff --git a/sdks/java/io/google-cloud-platform/src/main/java/org/apache/beam/sdk/io/gcp/bigquery/StorageApiWritesShardedRecords.java b/sdks/java/io/google-cloud-platform/src/main/java/org/apache/beam/sdk/io/gcp/bigquery/StorageApiWritesShardedRecords.java index 738a52b69cb7..d905c4bf93ca 100644 --- a/sdks/java/io/google-cloud-platform/src/main/java/org/apache/beam/sdk/io/gcp/bigquery/StorageApiWritesShardedRecords.java +++ b/sdks/java/io/google-cloud-platform/src/main/java/org/apache/beam/sdk/io/gcp/bigquery/StorageApiWritesShardedRecords.java @@ -531,6 +531,28 @@ public void process( element.getKey().getKey(), dynamicDestinations, datasetService); tableSchema = converter.getTableSchema(); descriptor = converter.getDescriptor(false); + + if (autoUpdateSchema) { + // A StreamWriter ignores table schema updates that happen prior to its creation. + // So before creating a StreamWriter below, we fetch the table schema to check if we + // missed an update. 
+ // If so, use the new schema instead of the base schema + @Nullable + TableSchema streamSchema = + MoreObjects.firstNonNull( + writeStreamService.getWriteStreamSchema(getOrCreateStream.get()), + TableSchema.getDefaultInstance()); + Optional newSchema = + TableSchemaUpdateUtils.getUpdatedSchema(tableSchema, streamSchema); + + if (newSchema.isPresent()) { + tableSchema = newSchema.get(); + descriptor = + TableRowToStorageApiProto.descriptorSchemaFromTableSchema( + tableSchema, true, false); + updatedSchema.write(tableSchema); + } + } } AppendClientInfo info = AppendClientInfo.of( diff --git a/sdks/java/io/google-cloud-platform/src/main/java/org/apache/beam/sdk/io/gcp/testing/FakeDatasetService.java b/sdks/java/io/google-cloud-platform/src/main/java/org/apache/beam/sdk/io/gcp/testing/FakeDatasetService.java index a99e4bea37a7..24229643eca3 100644 --- a/sdks/java/io/google-cloud-platform/src/main/java/org/apache/beam/sdk/io/gcp/testing/FakeDatasetService.java +++ b/sdks/java/io/google-cloud-platform/src/main/java/org/apache/beam/sdk/io/gcp/testing/FakeDatasetService.java @@ -590,11 +590,11 @@ public WriteStream createWriteStream(String tableUrn, Type type) throws Interrup @Override @Nullable - public WriteStream getWriteStream(String streamName) { + public com.google.cloud.bigquery.storage.v1.TableSchema getWriteStreamSchema(String streamName) { synchronized (FakeDatasetService.class) { @Nullable Stream stream = writeStreams.get(streamName); if (stream != null) { - return stream.toWriteStream(); + return stream.toWriteStream().getTableSchema(); } } // TODO(relax): Return the exact error that BigQuery returns. diff --git a/sdks/java/io/google-cloud-platform/src/test/java/org/apache/beam/sdk/io/gcp/bigquery/StorageApiSinkSchemaUpdateIT.java b/sdks/java/io/google-cloud-platform/src/test/java/org/apache/beam/sdk/io/gcp/bigquery/StorageApiSinkSchemaUpdateIT.java index ae9e9cefb150..3118e97b2b93 100644 --- a/sdks/java/io/google-cloud-platform/src/test/java/org/apache/beam/sdk/io/gcp/bigquery/StorageApiSinkSchemaUpdateIT.java +++ b/sdks/java/io/google-cloud-platform/src/test/java/org/apache/beam/sdk/io/gcp/bigquery/StorageApiSinkSchemaUpdateIT.java @@ -19,6 +19,8 @@ import static org.apache.beam.vendor.guava.v32_1_2_jre.com.google.common.base.MoreObjects.firstNonNull; import static org.junit.Assert.assertEquals; +import static org.junit.Assert.assertNotNull; +import static org.junit.Assert.assertNull; import static org.junit.Assert.assertTrue; import static org.junit.Assume.assumeTrue; @@ -33,7 +35,9 @@ import java.util.ArrayList; import java.util.Arrays; import java.util.Collections; +import java.util.HashMap; import java.util.List; +import java.util.Map; import java.util.Random; import java.util.Set; import java.util.function.Function; @@ -60,6 +64,7 @@ import org.apache.beam.sdk.values.TypeDescriptor; import org.apache.beam.vendor.guava.v32_1_2_jre.com.google.common.base.Splitter; import org.apache.beam.vendor.guava.v32_1_2_jre.com.google.common.collect.ImmutableList; +import org.apache.beam.vendor.guava.v32_1_2_jre.com.google.common.collect.ImmutableMap; import org.apache.beam.vendor.guava.v32_1_2_jre.com.google.common.collect.ImmutableSet; import org.apache.beam.vendor.guava.v32_1_2_jre.com.google.common.collect.Iterables; import org.joda.time.Duration; @@ -124,6 +129,9 @@ public static Iterable data() { private static final int TOTAL_N = 70; // Number of rows with the original schema private static final int ORIGINAL_N = 60; + // for dynamic destination test + private static final int 
NUM_DESTINATIONS = 3; + private static final int TOTAL_NUM_STREAMS = 9; private final Random randomGenerator = new Random(); @@ -145,6 +153,11 @@ public static void cleanUp() { } private String createTable(TableSchema tableSchema) throws IOException, InterruptedException { + return createTable(tableSchema, ""); + } + + private String createTable(TableSchema tableSchema, String suffix) + throws IOException, InterruptedException { String tableId = Iterables.get(Splitter.on('[').split(testName.getMethodName()), 0); if (useInputSchema) { tableId += "WithInputSchema"; @@ -152,6 +165,8 @@ private String createTable(TableSchema tableSchema) throws IOException, Interrup if (changeTableSchema) { tableId += "OnSchemaChange"; } + tableId += suffix; + BQ_CLIENT.deleteTable(PROJECT, BIG_QUERY_DATASET_ID, tableId); BQ_CLIENT.createNewTable( PROJECT, @@ -170,9 +185,8 @@ static class UpdateSchemaDoFn extends DoFn, TableRow> { private final String projectId; private final String datasetId; - private final String tableId; // represent as String because TableSchema is not serializable - private final String newSchema; + private final Map newSchemas; private transient BigqueryClient bqClient; @@ -183,11 +197,14 @@ static class UpdateSchemaDoFn extends DoFn, TableRow> { private final StateSpec> counter; public UpdateSchemaDoFn( - String projectId, String datasetId, String tableId, TableSchema newSchema) { + String projectId, String datasetId, Map newSchemas) { this.projectId = projectId; this.datasetId = datasetId; - this.tableId = tableId; - this.newSchema = BigQueryHelpers.toJsonString(newSchema); + Map serializableSchemas = new HashMap<>(); + for (Map.Entry entry : newSchemas.entrySet()) { + serializableSchemas.put(entry.getKey(), BigQueryHelpers.toJsonString(entry.getValue())); + } + this.newSchemas = serializableSchemas; this.bqClient = null; this.counter = StateSpecs.value(); } @@ -201,14 +218,17 @@ public void setup() { public void processElement(ProcessContext c, @StateId(ROW_COUNTER) ValueState counter) throws Exception { int current = firstNonNull(counter.read(), 0); - // We update schema early on to leave a healthy amount of time for - // StreamWriter to recognize it. - if (current == 10) { - bqClient.updateTableSchema( - projectId, - datasetId, - tableId, - BigQueryHelpers.fromJsonString(newSchema, TableSchema.class)); + // We update schema early on to leave a healthy amount of time for StreamWriter to recognize + // it. 
+ // We also update halfway through so that some writers are created *after* the schema update + if (current == TOTAL_NUM_STREAMS / 2) { + for (Map.Entry entry : newSchemas.entrySet()) { + bqClient.updateTableSchema( + projectId, + datasetId, + entry.getKey(), + BigQueryHelpers.fromJsonString(entry.getValue(), TableSchema.class)); + } } counter.write(++current); @@ -304,7 +324,7 @@ private void runStreamingPipelineWithSchemaChange( p.getOptions().as(BigQueryOptions.class).setStorageApiAppendThresholdBytes(0); // Limit parallelism so that all streams recognize the new schema in an expected short amount // of time (before we start writing rows with updated schema) - p.getOptions().as(BigQueryOptions.class).setNumStorageWriteApiStreams(3); + p.getOptions().as(BigQueryOptions.class).setNumStorageWriteApiStreams(TOTAL_NUM_STREAMS); // Need to manually enable streaming engine for legacy dataflow runner ExperimentalOptions.addExperiment( p.getOptions().as(ExperimentalOptions.class), GcpOptions.STREAMING_ENGINE_EXPERIMENT); @@ -394,7 +414,8 @@ private void runStreamingPipelineWithSchemaChange( .apply( "Update Schema", ParDo.of( - new UpdateSchemaDoFn(PROJECT, BIG_QUERY_DATASET_ID, tableId, updatedSchema))); + new UpdateSchemaDoFn( + PROJECT, BIG_QUERY_DATASET_ID, ImmutableMap.of(tableId, updatedSchema)))); } WriteResult result = rows.apply("Stream to BigQuery", write); if (useIgnoreUnknownValues) { @@ -494,13 +515,13 @@ public void checkRowsWithUpdatedSchema( if (Integer.parseInt((String) row.get("id")) < ORIGINAL_N || !useAutoSchemaUpdate || !changeTableSchema) { - assertTrue( + assertNull( String.format("Expected row to NOT have field %s:\n%s", extraField, row), - row.get(extraField) == null); + row.get(extraField)); } else { - assertTrue( + assertNotNull( String.format("Expected row to have field %s:\n%s", extraField, row), - row.get(extraField) != null); + row.get(extraField)); } } } @@ -539,4 +560,151 @@ public void testAtLeastOnceWithIgnoreUnknownValues() throws Exception { public void testAtLeastOnceWithAutoSchemaUpdate() throws Exception { runStreamingPipelineWithSchemaChange(Write.Method.STORAGE_API_AT_LEAST_ONCE, true, true); } + + public void runDynamicDestinationsWithAutoSchemaUpdate(boolean useAtLeastOnce) throws Exception { + Pipeline p = Pipeline.create(TestPipeline.testingPipelineOptions()); + // 0 threshold so that the stream tries fetching an updated schema after each append + p.getOptions().as(BigQueryOptions.class).setStorageApiAppendThresholdBytes(0); + // Total streams per destination + p.getOptions() + .as(BigQueryOptions.class) + .setNumStorageWriteApiStreams(TOTAL_NUM_STREAMS / NUM_DESTINATIONS); + // Need to manually enable streaming engine for legacy dataflow runner + ExperimentalOptions.addExperiment( + p.getOptions().as(ExperimentalOptions.class), GcpOptions.STREAMING_ENGINE_EXPERIMENT); + // Only run the most relevant test case on Dataflow + if (p.getOptions().getRunner().getName().contains("DataflowRunner")) { + assumeTrue( + "Skipping in favor of more relevant test case", changeTableSchema && useInputSchema); + } + + List fieldNamesOrigin = new ArrayList(Arrays.asList(FIELDS)); + + // Shuffle the fields in the write schema to do fuzz testing on field order + List fieldNamesShuffled = new ArrayList(fieldNamesOrigin); + Collections.shuffle(fieldNamesShuffled, randomGenerator); + TableSchema bqTableSchema = makeTableSchemaFromTypes(fieldNamesOrigin, null); + TableSchema inputSchema = makeTableSchemaFromTypes(fieldNamesShuffled, null); + + Map destinations = new 
HashMap<>(NUM_DESTINATIONS); + Map updatedSchemas = new HashMap<>(NUM_DESTINATIONS); + Map extraFields = new HashMap<>(NUM_DESTINATIONS); + Map rowFuncs = new HashMap<>(NUM_DESTINATIONS); + for (int i = 0; i < NUM_DESTINATIONS; i++) { + // The updated schema includes all fields in the original schema plus a random new field + List fieldNamesWithExtra = new ArrayList(fieldNamesOrigin); + String extraField = + fieldNamesOrigin.get(randomGenerator.nextInt(fieldNamesOrigin.size())) + "_EXTRA"; + fieldNamesWithExtra.add(extraField); + TableSchema updatedSchema = + makeTableSchemaFromTypes(fieldNamesWithExtra, ImmutableSet.of(extraField)); + GenerateRowFunc generateRowFunc = new GenerateRowFunc(fieldNamesOrigin, fieldNamesWithExtra); + + String tableId = createTable(bqTableSchema, "_dynamic_" + i); + String tableSpec = PROJECT + ":" + BIG_QUERY_DATASET_ID + "." + tableId; + + rowFuncs.put((long) i, generateRowFunc); + destinations.put((long) i, tableSpec); + updatedSchemas.put(tableId, updatedSchema); + extraFields.put(tableSpec, extraField); + } + + // build write transform + Write write = + BigQueryIO.writeTableRows() + .to( + row -> { + long l = (int) row.getValue().get("id") % NUM_DESTINATIONS; + String destination = destinations.get(l); + return new TableDestination(destination, null); + }) + .withAutoSchemaUpdate(true) + .ignoreUnknownValues() + .withMethod(Write.Method.STORAGE_API_AT_LEAST_ONCE) + .withCreateDisposition(CreateDisposition.CREATE_NEVER) + .withWriteDisposition(WriteDisposition.WRITE_APPEND); + if (useInputSchema) { + write = write.withSchema(inputSchema); + } + if (!useAtLeastOnce) { + write = + write + .withMethod(Write.Method.STORAGE_WRITE_API) + .withTriggeringFrequency(Duration.standardSeconds(1)); + } + + int numRows = TOTAL_N; + // set up and build pipeline + Instant start = new Instant(0); + // We give a healthy waiting period between each element to give Storage API streams a chance to + // recognize the new schema. Apply on relevant tests. + Duration interval = changeTableSchema ? Duration.standardSeconds(1) : Duration.millis(1); + Duration stop = + changeTableSchema ? Duration.standardSeconds(numRows - 1) : Duration.millis(numRows - 1); + Function getIdFromInstant = + changeTableSchema + ? 
(Function & Serializable) + (Instant instant) -> instant.getMillis() / 1000 + : (Function & Serializable) Instant::getMillis; + + // Generates rows with original schema up for row IDs under ORIGINAL_N + // Then generates rows with updated schema for the rest + // Rows with updated schema should only reach the table if ignoreUnknownValues is set, + // and the extra field should be present only when autoSchemaUpdate is set + PCollection instants = + p.apply( + "Generate Instants", + PeriodicImpulse.create() + .startAt(start) + .stopAt(start.plus(stop)) + .withInterval(interval) + .catchUpToNow(false)); + PCollection rows = + instants.apply( + "Create TableRows", + MapElements.into(TypeDescriptor.of(TableRow.class)) + .via( + instant -> { + long rowId = getIdFromInstant.apply(instant); + long dest = rowId % NUM_DESTINATIONS; + return rowFuncs.get(dest).apply(rowId); + })); + if (changeTableSchema) { + rows = + rows + // UpdateSchemaDoFn uses state, so need to have a KV input + .apply("Add a dummy key", WithKeys.of(1)) + .apply( + "Update Schema", + ParDo.of(new UpdateSchemaDoFn(PROJECT, BIG_QUERY_DATASET_ID, updatedSchemas))); + } + + WriteResult result = rows.apply("Stream to BigQuery", write); + // We ignore the extra fields, so no rows should have been sent to DLQ + PAssert.that("Check DLQ is empty", result.getFailedStorageApiInserts()).empty(); + p.run().waitUntilFinish(); + + Map expectedCounts = new HashMap<>(NUM_DESTINATIONS); + for (int i = 0; i < numRows; i++) { + long mod = i % NUM_DESTINATIONS; + String destination = destinations.get(mod); + expectedCounts.merge(destination, 1, Integer::sum); + } + + for (Map.Entry expectedCount : expectedCounts.entrySet()) { + String dest = expectedCount.getKey(); + checkRowCompleteness(dest, expectedCount.getValue(), true); + checkRowsWithUpdatedSchema(dest, extraFields.get(dest), true); + } + } + + @Test + public void testExactlyOnceDynamicDestinationsWithAutoSchemaUpdate() throws Exception { + runDynamicDestinationsWithAutoSchemaUpdate(false); + } + + @Test + public void testAtLeastOnceDynamicDestinationsWithAutoSchemaUpdate() throws Exception { + runDynamicDestinationsWithAutoSchemaUpdate(true); + } } From 5a3ddc4464454ea98b2c2093464313e5dd730883 Mon Sep 17 00:00:00 2001 From: Ahmed Abualsaud <65791736+ahmedabu98@users.noreply.github.com> Date: Wed, 8 Jan 2025 20:29:14 +0000 Subject: [PATCH 38/43] [Iceberg] cleanup FileIO resources (#33509) * cleanup FileIO resources * trigger integration tests * cleanup --- .../trigger_files/IO_Iceberg_Integration_Tests.json | 2 +- .../beam/sdk/io/iceberg/AppendFilesToTables.java | 10 +++++----- .../org/apache/beam/sdk/io/iceberg/RecordWriter.java | 6 +++++- 3 files changed, 11 insertions(+), 7 deletions(-) diff --git a/.github/trigger_files/IO_Iceberg_Integration_Tests.json b/.github/trigger_files/IO_Iceberg_Integration_Tests.json index 62ae7886c573..1efc8e9e4405 100644 --- a/.github/trigger_files/IO_Iceberg_Integration_Tests.json +++ b/.github/trigger_files/IO_Iceberg_Integration_Tests.json @@ -1,4 +1,4 @@ { "comment": "Modify this file in a trivial way to cause this test suite to run", - "modification": 4 + "modification": 1 } diff --git a/sdks/java/io/iceberg/src/main/java/org/apache/beam/sdk/io/iceberg/AppendFilesToTables.java b/sdks/java/io/iceberg/src/main/java/org/apache/beam/sdk/io/iceberg/AppendFilesToTables.java index deec779c6cc9..72220faf3004 100644 --- a/sdks/java/io/iceberg/src/main/java/org/apache/beam/sdk/io/iceberg/AppendFilesToTables.java +++ 
b/sdks/java/io/iceberg/src/main/java/org/apache/beam/sdk/io/iceberg/AppendFilesToTables.java @@ -48,7 +48,6 @@ import org.apache.iceberg.Table; import org.apache.iceberg.catalog.Catalog; import org.apache.iceberg.io.FileIO; -import org.apache.iceberg.io.OutputFile; import org.checkerframework.checker.nullness.qual.MonotonicNonNull; import org.slf4j.Logger; import org.slf4j.LoggerFactory; @@ -187,8 +186,10 @@ private void appendManifestFiles(Table table, Iterable fileWrit int specId = entry.getKey(); List files = entry.getValue(); PartitionSpec spec = Preconditions.checkStateNotNull(specs.get(specId)); - ManifestWriter writer = - createManifestWriter(table.location(), uuid, spec, table.io()); + ManifestWriter writer; + try (FileIO io = table.io()) { + writer = createManifestWriter(table.location(), uuid, spec, io); + } for (DataFile file : files) { writer.add(file); committedDataFileByteSize.update(file.fileSizeInBytes()); @@ -207,8 +208,7 @@ private ManifestWriter createManifestWriter( String.format( "%s/metadata/%s-%s-%s.manifest", tableLocation, manifestFilePrefix, uuid, spec.specId())); - OutputFile outputFile = io.newOutputFile(location); - return ManifestFiles.write(spec, outputFile); + return ManifestFiles.write(spec, io.newOutputFile(location)); } } } diff --git a/sdks/java/io/iceberg/src/main/java/org/apache/beam/sdk/io/iceberg/RecordWriter.java b/sdks/java/io/iceberg/src/main/java/org/apache/beam/sdk/io/iceberg/RecordWriter.java index 7941c13b0dfe..0d81d868c49f 100644 --- a/sdks/java/io/iceberg/src/main/java/org/apache/beam/sdk/io/iceberg/RecordWriter.java +++ b/sdks/java/io/iceberg/src/main/java/org/apache/beam/sdk/io/iceberg/RecordWriter.java @@ -29,6 +29,7 @@ import org.apache.iceberg.data.Record; import org.apache.iceberg.data.parquet.GenericParquetWriter; import org.apache.iceberg.io.DataWriter; +import org.apache.iceberg.io.FileIO; import org.apache.iceberg.io.OutputFile; import org.apache.iceberg.parquet.Parquet; import org.slf4j.Logger; @@ -66,7 +67,10 @@ class RecordWriter { fileFormat.addExtension( table.locationProvider().newDataLocation(table.spec(), partitionKey, filename)); } - OutputFile outputFile = table.io().newOutputFile(absoluteFilename); + OutputFile outputFile; + try (FileIO io = table.io()) { + outputFile = io.newOutputFile(absoluteFilename); + } switch (fileFormat) { case AVRO: From a19466ad93e39abb8d3c9527b1cff617a188a9e3 Mon Sep 17 00:00:00 2001 From: Danny McCormick Date: Wed, 8 Jan 2025 16:49:45 -0500 Subject: [PATCH 39/43] Pin protobuf 3 for debezium (#33541) * Pin protobuf 3 for debezium * CHANGES --- CHANGES.md | 2 ++ sdks/java/io/debezium/build.gradle | 11 +++++++++++ .../beam/sdk/io/gcp/pubsub/PubsubGrpcClient.java | 1 - 3 files changed, 13 insertions(+), 1 deletion(-) diff --git a/CHANGES.md b/CHANGES.md index ac3992a872de..189bd8c1f71c 100644 --- a/CHANGES.md +++ b/CHANGES.md @@ -67,10 +67,12 @@ ## New Features / Improvements * X feature added (Java/Python) ([#X](https://github.com/apache/beam/issues/X)). +* Upgraded to protobuf 4 (Java) ([#33192](https://github.com/apache/beam/issues/33192)). ## Breaking Changes * AWS V1 I/Os have been removed (Java). As part of this, x-lang Python Kinesis I/O has been updated to consume the V2 IO and it also no longer supports setting producer_properties ([#33430](https://github.com/apache/beam/issues/33430)). 
+* Upgraded to protobuf 4 (Java) ([#33192](https://github.com/apache/beam/issues/33192)), but forced Debezium IO to use protobuf 3 ([#33541](https://github.com/apache/beam/issues/33541) because Debezium clients are not protobuf 4 compatible. This may cause conflicts when using clients which are only compatible with protobuf 4. ## Deprecations diff --git a/sdks/java/io/debezium/build.gradle b/sdks/java/io/debezium/build.gradle index e3b88e22607d..d8ab954b4473 100644 --- a/sdks/java/io/debezium/build.gradle +++ b/sdks/java/io/debezium/build.gradle @@ -89,3 +89,14 @@ task integrationTest(type: Test, dependsOn: processTestResources) { useJUnit { } } + +configurations.all (Configuration it) -> { + resolutionStrategy { + // Force protobuf 3 because debezium is currently incompatible with protobuf 4. + // TODO - remove this and upgrade the version of debezium once a proto-4 compatible version is available + // https://github.com/apache/beam/pull/33526 does some of this, but was abandoned because it still doesn't + // work with protobuf 4. + force "com.google.protobuf:protobuf-java:3.25.5" + force "com.google.protobuf:protobuf-java-util:3.25.5" + } +} diff --git a/sdks/java/io/google-cloud-platform/src/main/java/org/apache/beam/sdk/io/gcp/pubsub/PubsubGrpcClient.java b/sdks/java/io/google-cloud-platform/src/main/java/org/apache/beam/sdk/io/gcp/pubsub/PubsubGrpcClient.java index de6c14722898..7fb30a1099aa 100644 --- a/sdks/java/io/google-cloud-platform/src/main/java/org/apache/beam/sdk/io/gcp/pubsub/PubsubGrpcClient.java +++ b/sdks/java/io/google-cloud-platform/src/main/java/org/apache/beam/sdk/io/gcp/pubsub/PubsubGrpcClient.java @@ -17,7 +17,6 @@ */ package org.apache.beam.sdk.io.gcp.pubsub; -import static org.apache.beam.vendor.guava.v32_1_2_jre.com.google.common.base.Preconditions.checkArgument; import static org.apache.beam.vendor.guava.v32_1_2_jre.com.google.common.base.Preconditions.checkState; import com.google.auth.Credentials; From 4fc5c869843e9439b3c73f80480d9b9ef651858a Mon Sep 17 00:00:00 2001 From: Robert Burke Date: Wed, 8 Jan 2025 16:34:11 -0800 Subject: [PATCH 40/43] [#31438] WindowingStrategy Plumbing + AllowedLateness + Fixing Session Windows (#33542) --- runners/prism/java/build.gradle | 15 -- .../prism/internal/engine/elementmanager.go | 44 +++--- .../runners/prism/internal/engine/strategy.go | 28 ++-- .../prism/internal/engine/strategy_test.go | 13 +- .../beam/runners/prism/internal/execute.go | 9 +- .../runners/prism/internal/handlerunner.go | 133 ++++++++++-------- .../prism/internal/jobservices/management.go | 5 +- .../beam/runners/prism/internal/urns/urns.go | 26 ++-- .../runners/portability/prism_runner_test.py | 23 --- 9 files changed, 129 insertions(+), 167 deletions(-) diff --git a/runners/prism/java/build.gradle b/runners/prism/java/build.gradle index 791952c625f4..3aae54c4a8cc 100644 --- a/runners/prism/java/build.gradle +++ b/runners/prism/java/build.gradle @@ -96,9 +96,6 @@ def sickbayTests = [ 'org.apache.beam.sdk.transforms.GroupByKeyTest$BasicTests.testAfterProcessingTimeContinuationTriggerEarly', 'org.apache.beam.sdk.transforms.ParDoTest$BundleInvariantsTests.testWatermarkUpdateMidBundle', 'org.apache.beam.sdk.transforms.ViewTest.testTriggeredLatestSingleton', - // Requires Allowed Lateness, among others. 
- 'org.apache.beam.sdk.transforms.ParDoTest$TimerTests.testEventTimeTimerSetWithinAllowedLateness', - 'org.apache.beam.sdk.transforms.ParDoTest$StateTests.testRequiresTimeSortedInputWithLateDataAndAllowedLateness', 'org.apache.beam.sdk.testing.TestStreamTest.testFirstElementLate', 'org.apache.beam.sdk.testing.TestStreamTest.testDiscardingMode', 'org.apache.beam.sdk.testing.TestStreamTest.testEarlyPanesOfWindow', @@ -116,10 +113,6 @@ def sickbayTests = [ // Coding error somehow: short write: reached end of stream after reading 5 bytes; 98 bytes expected 'org.apache.beam.sdk.testing.TestStreamTest.testMultiStage', - // Prism not firing sessions correctly (seems to be merging inapppropriately) - 'org.apache.beam.sdk.transforms.CombineTest$WindowingTests.testSessionsCombine', - 'org.apache.beam.sdk.transforms.CombineTest$WindowingTests.testSessionsCombineWithContext', - // Java side dying during execution. // https://github.com/apache/beam/issues/32930 'org.apache.beam.sdk.transforms.FlattenTest.testFlattenMultipleCoders', @@ -161,14 +154,6 @@ def sickbayTests = [ // TODO(https://github.com/apache/beam/issues/31231) 'org.apache.beam.sdk.transforms.RedistributeTest.testRedistributePreservesMetadata', - - // These tests fail once Late Data was being precisely dropped. - // They set a single element to be late data, and expect it (correctly) to be preserved. - // Since presently, these are treated as No-ops, the fix is to disable the - // dropping behavior when a stage's input is a Reshuffle/Redistribute transform. - 'org.apache.beam.sdk.transforms.ReshuffleTest.testReshuffleWithTimestampsStreaming', - 'org.apache.beam.sdk.transforms.RedistributeTest.testRedistributeWithTimestampsStreaming', - // Prism isn't handling Java's side input views properly. // https://github.com/apache/beam/issues/32932 // java.lang.IllegalArgumentException: PCollection with more than one element accessed as a singleton view. diff --git a/sdks/go/pkg/beam/runners/prism/internal/engine/elementmanager.go b/sdks/go/pkg/beam/runners/prism/internal/engine/elementmanager.go index bb3c8ceceeb8..fb1ae47d301e 100644 --- a/sdks/go/pkg/beam/runners/prism/internal/engine/elementmanager.go +++ b/sdks/go/pkg/beam/runners/prism/internal/engine/elementmanager.go @@ -261,8 +261,10 @@ func (em *ElementManager) AddStage(ID string, inputIDs, outputIDs []string, side // StageAggregates marks the given stage as an aggregation, which // means elements will only be processed based on windowing strategies. -func (em *ElementManager) StageAggregates(ID string) { - em.stages[ID].aggregate = true +func (em *ElementManager) StageAggregates(ID string, strat WinStrat) { + ss := em.stages[ID] + ss.aggregate = true + ss.strat = strat } // StageStateful marks the given stage as stateful, which means elements are @@ -1095,7 +1097,7 @@ type stageState struct { // Special handling bits stateful bool // whether this stage uses state or timers, and needs keyed processing. aggregate bool // whether this stage needs to block for aggregation. - strat winStrat // Windowing Strategy for aggregation fireings. + strat WinStrat // Windowing Strategy for aggregation fireings. processingTimeTimersFamilies map[string]bool // Indicates which timer families use the processing time domain. 
// onWindowExpiration management @@ -1154,7 +1156,6 @@ func makeStageState(ID string, inputIDs, outputIDs []string, sides []LinkID) *st ID: ID, outputIDs: outputIDs, sides: sides, - strat: defaultStrat{}, state: map[LinkID]map[typex.Window]map[string]StateData{}, watermarkHolds: newHoldTracker(), @@ -1179,18 +1180,21 @@ func makeStageState(ID string, inputIDs, outputIDs []string, sides []LinkID) *st func (ss *stageState) AddPending(newPending []element) int { ss.mu.Lock() defer ss.mu.Unlock() - // TODO(#https://github.com/apache/beam/issues/31438): - // Adjust with AllowedLateness - // Data that arrives after the *output* watermark is late. - threshold := ss.output - origPending := make([]element, 0, ss.pending.Len()) - for _, e := range newPending { - if e.window.MaxTimestamp() < threshold { - continue + if ss.aggregate { + // Late Data is data that has arrived after that window has expired. + // We only need to drop late data before aggregations. + // TODO - handle for side inputs too. + threshold := ss.output + origPending := make([]element, 0, ss.pending.Len()) + for _, e := range newPending { + if ss.strat.EarliestCompletion(e.window) < threshold { + // TODO: figure out Pane and trigger firings. + continue + } + origPending = append(origPending, e) } - origPending = append(origPending, e) + newPending = origPending } - newPending = origPending if ss.stateful { if ss.pendingByKeys == nil { ss.pendingByKeys = map[string]*dataAndTimers{} @@ -1626,10 +1630,8 @@ func (ss *stageState) updateWatermarks(em *ElementManager) set[string] { // They'll never be read in again. for _, wins := range ss.sideInputs { for win := range wins { - // TODO(#https://github.com/apache/beam/issues/31438): - // Adjust with AllowedLateness // Clear out anything we've already used. - if win.MaxTimestamp() < newOut { + if ss.strat.EarliestCompletion(win) < newOut { // If the expiry is in progress, skip this window. if ss.inProgressExpiredWindows[win] > 0 { continue @@ -1640,9 +1642,7 @@ func (ss *stageState) updateWatermarks(em *ElementManager) set[string] { } for _, wins := range ss.state { for win := range wins { - // TODO(#https://github.com/apache/beam/issues/31438): - // Adjust with AllowedLateness - if win.MaxTimestamp() < newOut { + if ss.strat.EarliestCompletion(win) < newOut { // If the expiry is in progress, skip collecting this window. if ss.inProgressExpiredWindows[win] > 0 { continue @@ -1685,9 +1685,7 @@ func (ss *stageState) createOnWindowExpirationBundles(newOut mtime.Time, em *Ele var preventDownstreamUpdate bool for win, keys := range ss.keysToExpireByWindow { // Check if the window has expired. - // TODO(#https://github.com/apache/beam/issues/31438): - // Adjust with AllowedLateness - if win.MaxTimestamp() >= newOut { + if ss.strat.EarliestCompletion(win) >= newOut { continue } // We can't advance the output watermark if there's garbage to collect. diff --git a/sdks/go/pkg/beam/runners/prism/internal/engine/strategy.go b/sdks/go/pkg/beam/runners/prism/internal/engine/strategy.go index 44e6064958c0..b0af2a09a75d 100644 --- a/sdks/go/pkg/beam/runners/prism/internal/engine/strategy.go +++ b/sdks/go/pkg/beam/runners/prism/internal/engine/strategy.go @@ -23,28 +23,16 @@ import ( "github.com/apache/beam/sdks/v2/go/pkg/beam/core/typex" ) -type winStrat interface { - EarliestCompletion(typex.Window) mtime.Time +// WinStrat configures the windowing strategy for the stage, based on the +// stage's input PCollection. 
+type WinStrat struct { + AllowedLateness time.Duration // Used to extend duration } -type defaultStrat struct{} - -func (ws defaultStrat) EarliestCompletion(w typex.Window) mtime.Time { - return w.MaxTimestamp() -} - -func (defaultStrat) String() string { - return "default" -} - -type sessionStrat struct { - GapSize time.Duration -} - -func (ws sessionStrat) EarliestCompletion(w typex.Window) mtime.Time { - return w.MaxTimestamp().Add(ws.GapSize) +func (ws WinStrat) EarliestCompletion(w typex.Window) mtime.Time { + return w.MaxTimestamp().Add(ws.AllowedLateness) } -func (ws sessionStrat) String() string { - return fmt.Sprintf("session[GapSize:%v]", ws.GapSize) +func (ws WinStrat) String() string { + return fmt.Sprintf("WinStrat[AllowedLateness:%v]", ws.AllowedLateness) } diff --git a/sdks/go/pkg/beam/runners/prism/internal/engine/strategy_test.go b/sdks/go/pkg/beam/runners/prism/internal/engine/strategy_test.go index 9d558396f806..845f748064c3 100644 --- a/sdks/go/pkg/beam/runners/prism/internal/engine/strategy_test.go +++ b/sdks/go/pkg/beam/runners/prism/internal/engine/strategy_test.go @@ -26,15 +26,16 @@ import ( func TestEarliestCompletion(t *testing.T) { tests := []struct { - strat winStrat + strat WinStrat input typex.Window want mtime.Time }{ - {defaultStrat{}, window.GlobalWindow{}, mtime.EndOfGlobalWindowTime}, - {defaultStrat{}, window.IntervalWindow{Start: 0, End: 4}, 3}, - {defaultStrat{}, window.IntervalWindow{Start: mtime.MinTimestamp, End: mtime.MaxTimestamp}, mtime.MaxTimestamp - 1}, - {sessionStrat{}, window.IntervalWindow{Start: 0, End: 4}, 3}, - {sessionStrat{GapSize: 3 * time.Millisecond}, window.IntervalWindow{Start: 0, End: 4}, 6}, + {WinStrat{}, window.GlobalWindow{}, mtime.EndOfGlobalWindowTime}, + {WinStrat{}, window.IntervalWindow{Start: 0, End: 4}, 3}, + {WinStrat{}, window.IntervalWindow{Start: mtime.MinTimestamp, End: mtime.MaxTimestamp}, mtime.MaxTimestamp - 1}, + {WinStrat{AllowedLateness: 5 * time.Second}, window.GlobalWindow{}, mtime.EndOfGlobalWindowTime.Add(5 * time.Second)}, + {WinStrat{AllowedLateness: 5 * time.Millisecond}, window.IntervalWindow{Start: 0, End: 4}, 8}, + {WinStrat{AllowedLateness: 5 * time.Second}, window.IntervalWindow{Start: mtime.MinTimestamp, End: mtime.MaxTimestamp}, mtime.MaxTimestamp.Add(5 * time.Second)}, } for _, test := range tests { diff --git a/sdks/go/pkg/beam/runners/prism/internal/execute.go b/sdks/go/pkg/beam/runners/prism/internal/execute.go index d41c3cd9c75c..ecc9cecdaa50 100644 --- a/sdks/go/pkg/beam/runners/prism/internal/execute.go +++ b/sdks/go/pkg/beam/runners/prism/internal/execute.go @@ -223,7 +223,10 @@ func executePipeline(ctx context.Context, wks map[string]*worker.W, j *jobservic KeyDec: kd, } } - em.StageAggregates(stage.ID) + ws := windowingStrategy(comps, tid) + em.StageAggregates(stage.ID, engine.WinStrat{ + AllowedLateness: time.Duration(ws.GetAllowedLateness()) * time.Millisecond, + }) case urns.TransformImpulse: impulses = append(impulses, stage.ID) em.AddStage(stage.ID, nil, []string{getOnlyValue(t.GetOutputs())}, nil) @@ -266,11 +269,11 @@ func executePipeline(ctx context.Context, wks map[string]*worker.W, j *jobservic case *pipepb.TestStreamPayload_Event_ElementEvent: var elms []engine.TestStreamElement for _, e := range ev.ElementEvent.GetElements() { - elms = append(elms, engine.TestStreamElement{Encoded: mayLP(e.GetEncodedElement()), EventTime: mtime.Time(e.GetTimestamp())}) + elms = append(elms, engine.TestStreamElement{Encoded: mayLP(e.GetEncodedElement()), EventTime: 
mtime.FromMilliseconds(e.GetTimestamp())}) } tsb.AddElementEvent(ev.ElementEvent.GetTag(), elms) case *pipepb.TestStreamPayload_Event_WatermarkEvent: - tsb.AddWatermarkEvent(ev.WatermarkEvent.GetTag(), mtime.Time(ev.WatermarkEvent.GetNewWatermark())) + tsb.AddWatermarkEvent(ev.WatermarkEvent.GetTag(), mtime.FromMilliseconds(ev.WatermarkEvent.GetNewWatermark())) case *pipepb.TestStreamPayload_Event_ProcessingTimeEvent: if ev.ProcessingTimeEvent.GetAdvanceDuration() == int64(mtime.MaxTimestamp) { // TODO: Determine the SDK common formalism for setting processing time to infinity. diff --git a/sdks/go/pkg/beam/runners/prism/internal/handlerunner.go b/sdks/go/pkg/beam/runners/prism/internal/handlerunner.go index be9d39ad02b7..340eb0b7eb5f 100644 --- a/sdks/go/pkg/beam/runners/prism/internal/handlerunner.go +++ b/sdks/go/pkg/beam/runners/prism/internal/handlerunner.go @@ -19,7 +19,6 @@ import ( "bytes" "fmt" "io" - "log/slog" "reflect" "sort" @@ -33,7 +32,6 @@ import ( "github.com/apache/beam/sdks/v2/go/pkg/beam/runners/prism/internal/urns" "github.com/apache/beam/sdks/v2/go/pkg/beam/runners/prism/internal/worker" "google.golang.org/protobuf/encoding/prototext" - "google.golang.org/protobuf/proto" ) // This file retains the logic for the pardo handler @@ -67,7 +65,12 @@ func (*runner) ConfigCharacteristic() reflect.Type { var _ transformPreparer = (*runner)(nil) func (*runner) PrepareUrns() []string { - return []string{urns.TransformReshuffle, urns.TransformFlatten} + return []string{ + urns.TransformReshuffle, + urns.TransformRedistributeArbitrarily, + urns.TransformRedistributeByKey, + urns.TransformFlatten, + } } // PrepareTransform handles special processing with respect runner transforms, like reshuffle. @@ -75,7 +78,7 @@ func (h *runner) PrepareTransform(tid string, t *pipepb.PTransform, comps *pipep switch t.GetSpec().GetUrn() { case urns.TransformFlatten: return h.handleFlatten(tid, t, comps) - case urns.TransformReshuffle: + case urns.TransformReshuffle, urns.TransformRedistributeArbitrarily, urns.TransformRedistributeByKey: return h.handleReshuffle(tid, t, comps) default: panic("unknown urn to Prepare: " + t.GetSpec().GetUrn()) @@ -190,7 +193,13 @@ func (h *runner) handleReshuffle(tid string, t *pipepb.PTransform, comps *pipepb var _ transformExecuter = (*runner)(nil) func (*runner) ExecuteUrns() []string { - return []string{urns.TransformFlatten, urns.TransformGBK, urns.TransformReshuffle} + return []string{ + urns.TransformFlatten, + urns.TransformGBK, + urns.TransformReshuffle, + urns.TransformRedistributeArbitrarily, + urns.TransformRedistributeByKey, + } } // ExecuteWith returns what environment the transform should execute in. @@ -289,6 +298,17 @@ func windowingStrategy(comps *pipepb.Components, tid string) *pipepb.WindowingSt return comps.GetWindowingStrategies()[pcol.GetWindowingStrategyId()] } +// getOrMake is a generic helper function for extracting or initializing a sub map. +// Avoids an amount of boiler plate. +func getOrMake[K, VK comparable, VV any, V map[VK]VV, M map[K]V](m M, key K) V { + v, ok := m[key] + if !ok { + v = make(V) + m[key] = v + } + return v +} + // gbkBytes re-encodes gbk inputs in a gbk result. func gbkBytes(ws *pipepb.WindowingStrategy, wc, kc, vc *pipepb.Coder, toAggregate [][]byte, coders map[string]*pipepb.Coder) []byte { // Pick how the timestamp of the aggregated output is computed. 
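For context on the windowing-strategy plumbing in patch 40: StageAggregates now receives a WinStrat carrying the strategy's allowed lateness, and EarliestCompletion extends a window's maximum timestamp by that duration, so late data is only dropped once the output watermark passes the window's maximum timestamp plus the allowed lateness. A minimal Beam Java pipeline that produces such a strategy might look like the sketch below; this is an illustrative example only (the class name and durations are made up), not code from the patch.

    import org.apache.beam.sdk.Pipeline;
    import org.apache.beam.sdk.options.PipelineOptionsFactory;
    import org.apache.beam.sdk.transforms.Count;
    import org.apache.beam.sdk.transforms.Create;
    import org.apache.beam.sdk.transforms.windowing.FixedWindows;
    import org.apache.beam.sdk.transforms.windowing.Window;
    import org.apache.beam.sdk.values.KV;
    import org.apache.beam.sdk.values.TimestampedValue;
    import org.joda.time.Duration;
    import org.joda.time.Instant;

    public class AllowedLatenessExample {
      public static void main(String[] args) {
        Pipeline p = Pipeline.create(PipelineOptionsFactory.fromArgs(args).create());
        p.apply(
                // Two timestamped elements that land in different one-minute windows.
                Create.timestamped(
                    TimestampedValue.of(KV.of("k", 1L), new Instant(0L)),
                    TimestampedValue.of(KV.of("k", 2L), new Instant(65_000L))))
            // The allowed lateness configured here is what execute.go now forwards to the
            // engine as WinStrat.AllowedLateness instead of rejecting it as unsupported.
            .apply(
                Window.<KV<String, Long>>into(FixedWindows.of(Duration.standardMinutes(1)))
                    .withAllowedLateness(Duration.standardSeconds(30))
                    .discardingFiredPanes())
            .apply(Count.perKey());
        p.run().waitUntilFinish();
      }
    }

Against a pipeline shaped like this, an element for the first one-minute window is retained by AddPending until the output watermark passes the window's maximum timestamp plus the 30 seconds of allowed lateness, which is the EarliestCompletion behavior exercised by the updated strategy_test.go cases.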
@@ -325,15 +345,15 @@ func gbkBytes(ws *pipepb.WindowingStrategy, wc, kc, vc *pipepb.Coder, toAggregat time mtime.Time values [][]byte } - // Map windows to a map of keys to a map of keys to time. + // Map keys to windows to element batches. // We ultimately emit the window, the key, the time, and the iterable of elements, // all contained in the final value. - windows := map[typex.Window]map[string]keyTime{} + keys := map[string]map[typex.Window]keyTime{} kd := pullDecoder(kc, coders) vd := pullDecoder(vc, coders) - // Aggregate by windows and keys, using the window coder and KV coders. + // Aggregate by keys, and windows, using the window coder and KV coders. // We need to extract and split the key bytes from the element bytes. for _, data := range toAggregate { // Parse out each element's data, and repeat. @@ -351,12 +371,8 @@ func gbkBytes(ws *pipepb.WindowingStrategy, wc, kc, vc *pipepb.Coder, toAggregat key := string(keyByt) value := vd(buf) for _, w := range ws { - wk, ok := windows[w] - if !ok { - wk = make(map[string]keyTime) - windows[w] = wk - } - kt, ok := wk[key] + wins := getOrMake(keys, key) + kt, ok := wins[w] if !ok { // If the window+key map doesn't have a value, inititialize time with the element time. // This allows earliest or latest to work properly in the outputTime function's first use. @@ -366,69 +382,64 @@ func gbkBytes(ws *pipepb.WindowingStrategy, wc, kc, vc *pipepb.Coder, toAggregat kt.key = keyByt kt.w = w kt.values = append(kt.values, value) - wk[key] = kt + wins[w] = kt } } } // If the strategy is session windows, then we need to get all the windows, sort them // and see which ones need to be merged together. + // Each key has their windows merged separately. if ws.GetWindowFn().GetUrn() == urns.WindowFnSession { - slog.Debug("sorting by session window") - session := &pipepb.SessionWindowsPayload{} - if err := (proto.UnmarshalOptions{}).Unmarshal(ws.GetWindowFn().GetPayload(), session); err != nil { - panic("unable to decode SessionWindowsPayload") - } - gapSize := mtime.Time(session.GetGapSize().AsDuration()) - - ordered := make([]window.IntervalWindow, 0, len(windows)) - for k := range windows { - ordered = append(ordered, k.(window.IntervalWindow)) - } - // Use a decreasing sort (latest to earliest) so we can correct - // the output timestamp to the new end of window immeadiately. - sort.Slice(ordered, func(i, j int) bool { - return ordered[i].MaxTimestamp() > ordered[j].MaxTimestamp() - }) - - cur := ordered[0] - sessionData := windows[cur] - delete(windows, cur) - for _, iw := range ordered[1:] { - // Check if the gap between windows is less than the gapSize. - // If not, this window is done, and we start a next window. - if iw.End+gapSize < cur.Start { - // Store current data with the current window. - windows[cur] = sessionData - // Use the incoming window instead, and clear it from the map. - cur = iw - sessionData = windows[iw] - delete(windows, cur) - // There's nothing to merge, since we've just started with this windowed data. - continue + for _, windows := range keys { + ordered := make([]window.IntervalWindow, 0, len(windows)) + for win := range windows { + ordered = append(ordered, win.(window.IntervalWindow)) } - // Extend the session with the incoming window, and merge the the incoming window's data. 
- cur.Start = iw.Start - toMerge := windows[iw] - delete(windows, iw) - for k, kt := range toMerge { - skt := sessionData[k] + // Use a decreasing sort (latest to earliest) so we can correct + // the output timestamp to the new end of window immeadiately. + sort.Slice(ordered, func(i, j int) bool { + return ordered[i].MaxTimestamp() > ordered[j].MaxTimestamp() + }) + + cur := ordered[0] + sessionData := windows[cur] + delete(windows, cur) + for _, iw := range ordered[1:] { + // GapSize is already incorporated into the windows, + // check for consecutive windows that don't overlap. + if cur.Start-iw.End > 0 { + // If so, this window is done, and we start a next window. + // Store current data with the current window. + windows[cur] = sessionData + // Use the incoming window instead, and clear it from the map. + cur = iw + sessionData = windows[iw] + delete(windows, cur) + // There's nothing to merge, since we've just started with this windowed data. + continue + } + // Extend the session with the incoming window, and merge the the incoming window's data. + cur.Start = iw.Start + toMerge := windows[iw] + delete(windows, iw) + // Ensure the output time matches the given function. - skt.time = outputTime(cur, kt.time, skt.time) - skt.key = kt.key - skt.w = cur - skt.values = append(skt.values, kt.values...) - sessionData[k] = skt + sessionData.time = outputTime(cur, toMerge.time, sessionData.time) + sessionData.key = toMerge.key + sessionData.w = cur + // TODO: May need to adjust the ordering here. + sessionData.values = append(sessionData.values, toMerge.values...) } + windows[cur] = sessionData } - windows[cur] = sessionData } // Everything's aggregated! // Time to turn things into a windowed KV> var buf bytes.Buffer - for _, w := range windows { - for _, kt := range w { + for _, wins := range keys { + for _, kt := range wins { exec.EncodeWindowedValueHeader( wEnc, []typex.Window{kt.w}, diff --git a/sdks/go/pkg/beam/runners/prism/internal/jobservices/management.go b/sdks/go/pkg/beam/runners/prism/internal/jobservices/management.go index b9a28e4bc652..a4307b706fa3 100644 --- a/sdks/go/pkg/beam/runners/prism/internal/jobservices/management.go +++ b/sdks/go/pkg/beam/runners/prism/internal/jobservices/management.go @@ -23,7 +23,6 @@ import ( "sync" "sync/atomic" - "github.com/apache/beam/sdks/v2/go/pkg/beam/core/graph/mtime" jobpb "github.com/apache/beam/sdks/v2/go/pkg/beam/model/jobmanagement_v1" pipepb "github.com/apache/beam/sdks/v2/go/pkg/beam/model/pipeline_v1" "github.com/apache/beam/sdks/v2/go/pkg/beam/runners/prism/internal/urns" @@ -144,7 +143,7 @@ func (s *Server) Prepare(ctx context.Context, req *jobpb.PrepareJobRequest) (_ * urns.TransformAssignWindows: // Very few expected transforms types for submitted pipelines. // Most URNs are for the runner to communicate back to the SDK for execution. - case urns.TransformReshuffle: + case urns.TransformReshuffle, urns.TransformRedistributeArbitrarily, urns.TransformRedistributeByKey: // Reshuffles use features we don't yet support, but we would like to // support them by making them the no-op they are, and be precise about // what we're ignoring. @@ -227,8 +226,6 @@ func (s *Server) Prepare(ctx context.Context, req *jobpb.PrepareJobRequest) (_ * // Inspect Windowing strategies for unsupported features. 
for wsID, ws := range job.Pipeline.GetComponents().GetWindowingStrategies() { - check("WindowingStrategy.AllowedLateness", ws.GetAllowedLateness(), int64(0), mtime.MaxTimestamp.Milliseconds()) - // Both Closing behaviors are identical without additional trigger firings. check("WindowingStrategy.ClosingBehaviour", ws.GetClosingBehavior(), pipepb.ClosingBehavior_EMIT_IF_NONEMPTY, pipepb.ClosingBehavior_EMIT_ALWAYS) check("WindowingStrategy.AccumulationMode", ws.GetAccumulationMode(), pipepb.AccumulationMode_DISCARDING) diff --git a/sdks/go/pkg/beam/runners/prism/internal/urns/urns.go b/sdks/go/pkg/beam/runners/prism/internal/urns/urns.go index 12e62ef84a81..170073b72419 100644 --- a/sdks/go/pkg/beam/runners/prism/internal/urns/urns.go +++ b/sdks/go/pkg/beam/runners/prism/internal/urns/urns.go @@ -56,18 +56,20 @@ var ( var ( // SDK transforms. - TransformParDo = ptUrn(pipepb.StandardPTransforms_PAR_DO) - TransformCombinePerKey = ctUrn(pipepb.StandardPTransforms_COMBINE_PER_KEY) - TransformCombineGlobally = ctUrn(pipepb.StandardPTransforms_COMBINE_GLOBALLY) - TransformReshuffle = ctUrn(pipepb.StandardPTransforms_RESHUFFLE) - TransformCombineGroupedValues = cmbtUrn(pipepb.StandardPTransforms_COMBINE_GROUPED_VALUES) - TransformPreCombine = cmbtUrn(pipepb.StandardPTransforms_COMBINE_PER_KEY_PRECOMBINE) - TransformMerge = cmbtUrn(pipepb.StandardPTransforms_COMBINE_PER_KEY_MERGE_ACCUMULATORS) - TransformExtract = cmbtUrn(pipepb.StandardPTransforms_COMBINE_PER_KEY_EXTRACT_OUTPUTS) - TransformPairWithRestriction = sdfUrn(pipepb.StandardPTransforms_PAIR_WITH_RESTRICTION) - TransformSplitAndSize = sdfUrn(pipepb.StandardPTransforms_SPLIT_AND_SIZE_RESTRICTIONS) - TransformProcessSizedElements = sdfUrn(pipepb.StandardPTransforms_PROCESS_SIZED_ELEMENTS_AND_RESTRICTIONS) - TransformTruncate = sdfUrn(pipepb.StandardPTransforms_TRUNCATE_SIZED_RESTRICTION) + TransformParDo = ptUrn(pipepb.StandardPTransforms_PAR_DO) + TransformCombinePerKey = ctUrn(pipepb.StandardPTransforms_COMBINE_PER_KEY) + TransformCombineGlobally = ctUrn(pipepb.StandardPTransforms_COMBINE_GLOBALLY) + TransformReshuffle = ctUrn(pipepb.StandardPTransforms_RESHUFFLE) + TransformRedistributeArbitrarily = ctUrn(pipepb.StandardPTransforms_REDISTRIBUTE_ARBITRARILY) + TransformRedistributeByKey = ctUrn(pipepb.StandardPTransforms_REDISTRIBUTE_BY_KEY) + TransformCombineGroupedValues = cmbtUrn(pipepb.StandardPTransforms_COMBINE_GROUPED_VALUES) + TransformPreCombine = cmbtUrn(pipepb.StandardPTransforms_COMBINE_PER_KEY_PRECOMBINE) + TransformMerge = cmbtUrn(pipepb.StandardPTransforms_COMBINE_PER_KEY_MERGE_ACCUMULATORS) + TransformExtract = cmbtUrn(pipepb.StandardPTransforms_COMBINE_PER_KEY_EXTRACT_OUTPUTS) + TransformPairWithRestriction = sdfUrn(pipepb.StandardPTransforms_PAIR_WITH_RESTRICTION) + TransformSplitAndSize = sdfUrn(pipepb.StandardPTransforms_SPLIT_AND_SIZE_RESTRICTIONS) + TransformProcessSizedElements = sdfUrn(pipepb.StandardPTransforms_PROCESS_SIZED_ELEMENTS_AND_RESTRICTIONS) + TransformTruncate = sdfUrn(pipepb.StandardPTransforms_TRUNCATE_SIZED_RESTRICTION) // Window Manipulation TransformAssignWindows = ptUrn(pipepb.StandardPTransforms_ASSIGN_WINDOWS) diff --git a/sdks/python/apache_beam/runners/portability/prism_runner_test.py b/sdks/python/apache_beam/runners/portability/prism_runner_test.py index 337ac9919487..1e8880a60a35 100644 --- a/sdks/python/apache_beam/runners/portability/prism_runner_test.py +++ b/sdks/python/apache_beam/runners/portability/prism_runner_test.py @@ -34,9 +34,6 @@ from apache_beam.options.pipeline_options import 
PortableOptions from apache_beam.runners.portability import portable_runner_test from apache_beam.testing.util import assert_that -from apache_beam.testing.util import equal_to -from apache_beam.transforms import window -from apache_beam.utils import timestamp # Run as # @@ -174,26 +171,6 @@ def create_options(self): return options - # Slightly more robust session window test: - # Validates that an inner grouping doesn't duplicate data either. - # Copied also because the timestamp in fn_runner_test.py isn't being - # inferred correctly as seconds for some reason, but as micros. - # The belabored specification is validating the timestamp type works at least. - # See https://github.com/apache/beam/issues/32085 - def test_windowing(self): - with self.create_pipeline() as p: - res = ( - p - | beam.Create([1, 2, 100, 101, 102, 123]) - | beam.Map( - lambda t: window.TimestampedValue( - ('k', t), timestamp.Timestamp.of(t).micros)) - | beam.WindowInto(beam.transforms.window.Sessions(10)) - | beam.GroupByKey() - | beam.Map(lambda k_vs1: (k_vs1[0], sorted(k_vs1[1])))) - assert_that( - res, equal_to([('k', [1, 2]), ('k', [100, 101, 102]), ('k', [123])])) - # Can't read host files from within docker, read a "local" file there. def test_read(self): print('name:', __name__) From 6b3783fbc51d1cde4ece7be99c319bff7c87867a Mon Sep 17 00:00:00 2001 From: Ahmed Abualsaud <65791736+ahmedabu98@users.noreply.github.com> Date: Thu, 9 Jan 2025 16:03:13 +0000 Subject: [PATCH 41/43] [Managed Iceberg] support BQMS catalog (#33511) * Add BQMS catalog * trigger integration tests * build fix * use shaded jar * shadowClosure * use global timeout for tests * define version in BeamModulePlugin * address comments --- .../beam/gradle/BeamModulePlugin.groovy | 5 ++ sdks/java/io/expansion-service/build.gradle | 6 +- sdks/java/io/iceberg/bqms/build.gradle | 63 +++++++++++++++ sdks/java/io/iceberg/build.gradle | 8 ++ .../catalog/BigQueryMetastoreCatalogIT.java | 76 +++++++++++++++++++ .../sdk/io/iceberg/catalog/HiveCatalogIT.java | 1 - .../iceberg/catalog/IcebergCatalogBaseIT.java | 10 ++- settings.gradle.kts | 2 + 8 files changed, 166 insertions(+), 5 deletions(-) create mode 100644 sdks/java/io/iceberg/bqms/build.gradle create mode 100644 sdks/java/io/iceberg/src/test/java/org/apache/beam/sdk/io/iceberg/catalog/BigQueryMetastoreCatalogIT.java diff --git a/buildSrc/src/main/groovy/org/apache/beam/gradle/BeamModulePlugin.groovy b/buildSrc/src/main/groovy/org/apache/beam/gradle/BeamModulePlugin.groovy index 7b791ef9aa8e..76bc7f8a819f 100644 --- a/buildSrc/src/main/groovy/org/apache/beam/gradle/BeamModulePlugin.groovy +++ b/buildSrc/src/main/groovy/org/apache/beam/gradle/BeamModulePlugin.groovy @@ -618,6 +618,7 @@ class BeamModulePlugin implements Plugin { def influxdb_version = "2.19" def httpclient_version = "4.5.13" def httpcore_version = "4.4.14" + def iceberg_bqms_catalog_version = "1.5.2-0.1.0" def jackson_version = "2.15.4" def jaxb_api_version = "2.3.3" def jsr305_version = "3.0.2" @@ -650,6 +651,10 @@ class BeamModulePlugin implements Plugin { // Export Spark versions, so they are defined in a single place only project.ext.spark3_version = spark3_version + // version for BigQueryMetastore catalog (used by sdks:java:io:iceberg:bqms) + // TODO: remove this and download the jar normally when the catalog gets + // open-sourced (https://github.com/apache/iceberg/pull/11039) + project.ext.iceberg_bqms_catalog_version = iceberg_bqms_catalog_version // A map of maps containing common libraries used per language. 
To use: // dependencies { diff --git a/sdks/java/io/expansion-service/build.gradle b/sdks/java/io/expansion-service/build.gradle index 38bee450e752..433000922b61 100644 --- a/sdks/java/io/expansion-service/build.gradle +++ b/sdks/java/io/expansion-service/build.gradle @@ -59,11 +59,13 @@ dependencies { // **** IcebergIO runtime dependencies **** runtimeOnly library.java.hadoop_auth runtimeOnly library.java.hadoop_client - // Needed when using GCS as the warehouse location. + // For writing to GCS runtimeOnly library.java.bigdataoss_gcs_connector - // Needed for HiveCatalog + // HiveCatalog runtimeOnly ("org.apache.iceberg:iceberg-hive-metastore:1.4.2") runtimeOnly project(path: ":sdks:java:io:iceberg:hive") + // BigQueryMetastoreCatalog (Java 11+) + runtimeOnly project(path: ":sdks:java:io:iceberg:bqms", configuration: "shadow") runtimeOnly library.java.kafka_clients runtimeOnly library.java.slf4j_jdk14 diff --git a/sdks/java/io/iceberg/bqms/build.gradle b/sdks/java/io/iceberg/bqms/build.gradle new file mode 100644 index 000000000000..e42aafc5f424 --- /dev/null +++ b/sdks/java/io/iceberg/bqms/build.gradle @@ -0,0 +1,63 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * License); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an AS IS BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +plugins { + id 'org.apache.beam.module' +} + +applyJavaNature( + automaticModuleName: 'org.apache.beam.sdk.io.iceberg.bqms', + shadowClosure: {}, + exportJavadoc: false, + publish: false, // it's an intermediate jar for io-expansion-service + validateShadowJar: false +) + +def libDir = "$buildDir/libs" +def bqmsFileName = "iceberg-bqms-catalog-${iceberg_bqms_catalog_version}.jar" +task downloadBqmsJar(type: Copy) { + // TODO: remove this workaround and downlooad normally when the catalog gets open-sourced: + // (https://github.com/apache/iceberg/pull/11039) + def jarUrl = "https://storage.googleapis.com/spark-lib/bigquery/iceberg-bigquery-catalog-${iceberg_bqms_catalog_version}.jar" + def outputDir = file("$libDir") + outputDir.mkdirs() + def destFile = new File(outputDir, bqmsFileName) + + if (!destFile.exists()) { + try { + ant.get(src: jarUrl, dest: destFile) + println "Successfully downloaded BQMS catalog jar: $destFile" + } catch (Exception e) { + println "Could not download $jarUrl: ${e.message}" + } + } +} + +repositories { + flatDir { + dirs "$libDir" + } +} + +compileJava.dependsOn downloadBqmsJar + +dependencies { + implementation files("$libDir/$bqmsFileName") +} + +description = "Apache Beam :: SDKs :: Java :: IO :: Iceberg :: BigQuery Metastore" +ext.summary = "A copy of the BQMS catalog." 
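Because the new :sdks:java:io:iceberg:bqms module only wraps a pre-built jar fetched at build time, the most likely failure mode is the catalog class never reaching the expansion service's runtime classpath (and the catalog needs Java 11+, per the test exclusion below). The following is a hypothetical sanity check, not part of the patch; the only detail it takes from the patch is the implementation class name that the integration test below points "catalog-impl" at.

    public class BqmsClasspathCheck {
      public static void main(String[] args) {
        // Implementation class that the Iceberg "catalog-impl" property loads reflectively.
        String bqmsCatalog = "org.apache.iceberg.gcp.bigquery.BigQueryMetastoreCatalog";
        try {
          Class<?> clazz = Class.forName(bqmsCatalog);
          System.out.println("Found " + clazz.getName() + " on the classpath");
        } catch (ClassNotFoundException e) {
          System.out.println(
              bqmsCatalog
                  + " not found; check the runtimeOnly dependency on"
                  + " :sdks:java:io:iceberg:bqms (shadow configuration)");
        }
      }
    }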
diff --git a/sdks/java/io/iceberg/build.gradle b/sdks/java/io/iceberg/build.gradle index 19fcbb7d1ea0..1775dfc5b77b 100644 --- a/sdks/java/io/iceberg/build.gradle +++ b/sdks/java/io/iceberg/build.gradle @@ -83,6 +83,9 @@ dependencies { exclude group: "org.apache.parquet", module: "parquet-hadoop-bundle" } + // BigQueryMetastore catalog dep + testImplementation project(path: ":sdks:java:io:iceberg:bqms", configuration: "shadow") + testRuntimeOnly library.java.slf4j_jdk14 testRuntimeOnly project(path: ":runners:direct-java", configuration: "shadow") testRuntimeOnly project(path: ":runners:google-cloud-dataflow-java") @@ -136,6 +139,11 @@ task integrationTest(type: Test) { outputs.upToDateWhen { false } include '**/*IT.class' + // BQ metastore catalog doesn't support java 8 + if (project.findProperty('testJavaVersion') == '8' || + JavaVersion.current().equals(JavaVersion.VERSION_1_8)) { + exclude '**/BigQueryMetastoreCatalogIT.class' + } maxParallelForks 4 classpath = sourceSets.test.runtimeClasspath diff --git a/sdks/java/io/iceberg/src/test/java/org/apache/beam/sdk/io/iceberg/catalog/BigQueryMetastoreCatalogIT.java b/sdks/java/io/iceberg/src/test/java/org/apache/beam/sdk/io/iceberg/catalog/BigQueryMetastoreCatalogIT.java new file mode 100644 index 000000000000..39920e66199b --- /dev/null +++ b/sdks/java/io/iceberg/src/test/java/org/apache/beam/sdk/io/iceberg/catalog/BigQueryMetastoreCatalogIT.java @@ -0,0 +1,76 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.beam.sdk.io.iceberg.catalog; + +import java.io.IOException; +import java.util.Map; +import org.apache.beam.vendor.guava.v32_1_2_jre.com.google.common.collect.ImmutableMap; +import org.apache.hadoop.conf.Configuration; +import org.apache.iceberg.CatalogUtil; +import org.apache.iceberg.catalog.Catalog; +import org.apache.iceberg.catalog.Namespace; +import org.apache.iceberg.catalog.TableIdentifier; + +public class BigQueryMetastoreCatalogIT extends IcebergCatalogBaseIT { + static final String BQMS_CATALOG = "org.apache.iceberg.gcp.bigquery.BigQueryMetastoreCatalog"; + static final String DATASET = "managed_iceberg_bqms_tests_no_delete"; + static final long SALT = System.nanoTime(); + + @Override + public String tableId() { + return DATASET + "." 
+ testName.getMethodName() + "_" + SALT; + } + + @Override + public Catalog createCatalog() { + return CatalogUtil.loadCatalog( + BQMS_CATALOG, + "bqms_" + catalogName, + ImmutableMap.builder() + .put("gcp_project", options.getProject()) + .put("gcp_location", "us-central1") + .put("warehouse", warehouse) + .build(), + new Configuration()); + } + + @Override + public void catalogCleanup() throws IOException { + for (TableIdentifier tableIdentifier : catalog.listTables(Namespace.of(DATASET))) { + // only delete tables that were created in this test run + if (tableIdentifier.name().contains(String.valueOf(SALT))) { + catalog.dropTable(tableIdentifier); + } + } + } + + @Override + public Map managedIcebergConfig(String tableId) { + return ImmutableMap.builder() + .put("table", tableId) + .put( + "catalog_properties", + ImmutableMap.builder() + .put("gcp_project", options.getProject()) + .put("gcp_location", "us-central1") + .put("warehouse", warehouse) + .put("catalog-impl", BQMS_CATALOG) + .build()) + .build(); + } +} diff --git a/sdks/java/io/iceberg/src/test/java/org/apache/beam/sdk/io/iceberg/catalog/HiveCatalogIT.java b/sdks/java/io/iceberg/src/test/java/org/apache/beam/sdk/io/iceberg/catalog/HiveCatalogIT.java index f31eb19906ff..076d3f4f9db8 100644 --- a/sdks/java/io/iceberg/src/test/java/org/apache/beam/sdk/io/iceberg/catalog/HiveCatalogIT.java +++ b/sdks/java/io/iceberg/src/test/java/org/apache/beam/sdk/io/iceberg/catalog/HiveCatalogIT.java @@ -65,7 +65,6 @@ public Catalog createCatalog() { @Override public void catalogCleanup() throws Exception { - System.out.println("xxx CLEANING UP!"); if (hiveMetastoreExtension != null) { hiveMetastoreExtension.cleanup(); } diff --git a/sdks/java/io/iceberg/src/test/java/org/apache/beam/sdk/io/iceberg/catalog/IcebergCatalogBaseIT.java b/sdks/java/io/iceberg/src/test/java/org/apache/beam/sdk/io/iceberg/catalog/IcebergCatalogBaseIT.java index 8e4a74cd61d4..3970fd07c5c6 100644 --- a/sdks/java/io/iceberg/src/test/java/org/apache/beam/sdk/io/iceberg/catalog/IcebergCatalogBaseIT.java +++ b/sdks/java/io/iceberg/src/test/java/org/apache/beam/sdk/io/iceberg/catalog/IcebergCatalogBaseIT.java @@ -85,6 +85,7 @@ import org.junit.Rule; import org.junit.Test; import org.junit.rules.TestName; +import org.junit.rules.Timeout; import org.slf4j.Logger; import org.slf4j.LoggerFactory; @@ -145,7 +146,11 @@ public void setUp() throws Exception { @After public void cleanUp() throws Exception { - catalogCleanup(); + try { + catalogCleanup(); + } catch (Exception e) { + LOG.warn("Catalog cleanup failed.", e); + } try { GcsUtil gcsUtil = options.as(GcsOptions.class).getGcsUtil(); @@ -163,7 +168,7 @@ public void cleanUp() throws Exception { gcsUtil.remove(filesToDelete); } catch (Exception e) { - LOG.warn("Failed to clean up files.", e); + LOG.warn("Failed to clean up GCS files.", e); } } @@ -173,6 +178,7 @@ public void cleanUp() throws Exception { private static final String RANDOM = UUID.randomUUID().toString(); @Rule public TestPipeline pipeline = TestPipeline.create(); @Rule public TestName testName = new TestName(); + @Rule public transient Timeout globalTimeout = Timeout.seconds(300); private static final int NUM_SHARDS = 10; private static final Logger LOG = LoggerFactory.getLogger(IcebergCatalogBaseIT.class); private static final Schema DOUBLY_NESTED_ROW_SCHEMA = diff --git a/settings.gradle.kts b/settings.gradle.kts index 67ba773d4070..69105a9c1cf2 100644 --- a/settings.gradle.kts +++ b/settings.gradle.kts @@ -350,3 +350,5 @@ 
include("sdks:java:extensions:combiners") findProject(":sdks:java:extensions:combiners")?.name = "combiners" include("sdks:java:io:iceberg:hive") findProject(":sdks:java:io:iceberg:hive")?.name = "hive" +include("sdks:java:io:iceberg:bqms") +findProject(":sdks:java:io:iceberg:bqms")?.name = "bqms" From c6f8aae7a27300d9fea72bc4375d31c59ea615c9 Mon Sep 17 00:00:00 2001 From: Danny McCormick Date: Thu, 9 Jan 2025 12:04:34 -0500 Subject: [PATCH 42/43] Temporarily skip deleting beam-performancetests-singlestoreio-12661373176 (#33545) * Add printing to k8s script * Temporarily skip bad namespace * Update stale_k8s_workload_cleaner.sh * Add context * Update for all singlestore io instances --- .test-infra/tools/stale_k8s_workload_cleaner.sh | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/.test-infra/tools/stale_k8s_workload_cleaner.sh b/.test-infra/tools/stale_k8s_workload_cleaner.sh index 9ddaf17f2ce8..c167871c8b94 100755 --- a/.test-infra/tools/stale_k8s_workload_cleaner.sh +++ b/.test-infra/tools/stale_k8s_workload_cleaner.sh @@ -43,7 +43,10 @@ function should_teardown() { gcloud container clusters get-credentials io-datastores --zone us-central1-a --project apache-beam-testing while read NAME STATUS AGE; do - if [[ $NAME =~ ^beam-.+(test|-it) ]] && should_teardown $AGE; then + # Regex has temporary workaround to avoid trying to delete beam-performancetests-singlestoreio-* to avoid getting stuck in a terminal state + # See https://github.com/apache/beam/pull/33545 for context. + # This may be safe to remove if https://cloud.google.com/knowledge/kb/deleted-namespace-remains-in-terminating-status-000004867 has been resolved, just try it before checking in :) + if [[ $NAME =~ ^beam-.+(test|-it)(?!s-singlestoreio) ]] && should_teardown $AGE; then kubectl delete namespace $NAME fi done < <( kubectl get namespaces --context=gke_${PROJECT}_${LOCATION}_${CLUSTER} ) From d50cc15c483adb457e448474c0a5401a01e3dcfc Mon Sep 17 00:00:00 2001 From: Ahmed Abualsaud <65791736+ahmedabu98@users.noreply.github.com> Date: Thu, 9 Jan 2025 18:09:14 +0000 Subject: [PATCH 43/43] [Managed Iceberg] Fix partition value race condition (#33549) * fix and update tests * dont mention df yet * add PR link * whitespace --- .../IO_Iceberg_Integration_Tests.json | 2 +- CHANGES.md | 10 +++++++ .../sdk/io/iceberg/RecordWriterManager.java | 20 +++++++------ .../iceberg/catalog/IcebergCatalogBaseIT.java | 28 ++++++++++++++++++- 4 files changed, 50 insertions(+), 10 deletions(-) diff --git a/.github/trigger_files/IO_Iceberg_Integration_Tests.json b/.github/trigger_files/IO_Iceberg_Integration_Tests.json index 1efc8e9e4405..b73af5e61a43 100644 --- a/.github/trigger_files/IO_Iceberg_Integration_Tests.json +++ b/.github/trigger_files/IO_Iceberg_Integration_Tests.json @@ -1,4 +1,4 @@ { - "comment": "Modify this file in a trivial way to cause this test suite to run", + "comment": "Modify this file in a trivial way to cause this test suite to run.", "modification": 1 } diff --git a/CHANGES.md b/CHANGES.md index 189bd8c1f71c..2333c835e038 100644 --- a/CHANGES.md +++ b/CHANGES.md @@ -115,6 +115,7 @@ ## Bugfixes * Fixed EventTimeTimer ordering in Prism. ([#32222](https://github.com/apache/beam/issues/32222)). +* [Managed Iceberg] Fixed a bug where DataFile metadata was assigned incorrect partition values ([#33549](https://github.com/apache/beam/pull/33549)). 
## Security Fixes @@ -157,6 +158,11 @@ * Adding flag to support conditionally disabling auto-commit in JdbcIO ReadFn ([#31111](https://github.com/apache/beam/issues/31111)) * (Python) Fixed BigQuery Enrichment bug that can lead to multiple conditions returning duplicate rows, batching returning incorrect results and conditions not scoped by row during batching ([#32780](https://github.com/apache/beam/pull/32780)). +## Known Issues + +* [Managed Iceberg] DataFile metadata is assigned incorrect partition values ([#33497](https://github.com/apache/beam/issues/33497)). + * Fixed in 2.62.0 + # [2.60.0] - 2024-10-17 ## Highlights @@ -211,6 +217,8 @@ when running on 3.8. ([#31192](https://github.com/apache/beam/issues/31192)) * Duplicate Rows: Multiple conditions may be applied incorrectly, leading to the duplication of rows in the output. * Incorrect Results with Batched Requests: Conditions may not be correctly scoped to individual rows within the batch, potentially causing inaccurate results. * Fixed in 2.61.0. +* [Managed Iceberg] DataFile metadata is assigned incorrect partition values ([#33497](https://github.com/apache/beam/issues/33497)). + * Fixed in 2.62.0 # [2.59.0] - 2024-09-11 @@ -259,6 +267,8 @@ when running on 3.8. ([#31192](https://github.com/apache/beam/issues/31192)) * Duplicate Rows: Multiple conditions may be applied incorrectly, leading to the duplication of rows in the output. * Incorrect Results with Batched Requests: Conditions may not be correctly scoped to individual rows within the batch, potentially causing inaccurate results. * Fixed in 2.61.0. +* [Managed Iceberg] DataFile metadata is assigned incorrect partition values ([#33497](https://github.com/apache/beam/issues/33497)). + * Fixed in 2.62.0 # [2.58.1] - 2024-08-15 diff --git a/sdks/java/io/iceberg/src/main/java/org/apache/beam/sdk/io/iceberg/RecordWriterManager.java b/sdks/java/io/iceberg/src/main/java/org/apache/beam/sdk/io/iceberg/RecordWriterManager.java index 4c21a0175ab0..63186f26fb5a 100644 --- a/sdks/java/io/iceberg/src/main/java/org/apache/beam/sdk/io/iceberg/RecordWriterManager.java +++ b/sdks/java/io/iceberg/src/main/java/org/apache/beam/sdk/io/iceberg/RecordWriterManager.java @@ -96,7 +96,9 @@ class DestinationState { private final IcebergDestination icebergDestination; private final PartitionSpec spec; private final org.apache.iceberg.Schema schema; - private final PartitionKey partitionKey; + // used to determine the partition to which a record belongs + // must not be directly used to create a writer + private final PartitionKey routingPartitionKey; private final Table table; private final String stateToken = UUID.randomUUID().toString(); final Cache writers; @@ -109,7 +111,7 @@ class DestinationState { this.icebergDestination = icebergDestination; this.schema = table.schema(); this.spec = table.spec(); - this.partitionKey = new PartitionKey(spec, schema); + this.routingPartitionKey = new PartitionKey(spec, schema); this.table = table; for (PartitionField partitionField : spec.fields()) { partitionFieldMap.put(partitionField.name(), partitionField); @@ -154,12 +156,12 @@ class DestinationState { * can't create a new writer, the {@link Record} is rejected and {@code false} is returned. 
*/ boolean write(Record record) { - partitionKey.partition(getPartitionableRecord(record)); + routingPartitionKey.partition(getPartitionableRecord(record)); - if (!writers.asMap().containsKey(partitionKey) && openWriters >= maxNumWriters) { + if (!writers.asMap().containsKey(routingPartitionKey) && openWriters >= maxNumWriters) { return false; } - RecordWriter writer = fetchWriterForPartition(partitionKey); + RecordWriter writer = fetchWriterForPartition(routingPartitionKey); writer.write(record); return true; } @@ -173,10 +175,12 @@ private RecordWriter fetchWriterForPartition(PartitionKey partitionKey) { RecordWriter recordWriter = writers.getIfPresent(partitionKey); if (recordWriter == null || recordWriter.bytesWritten() > maxFileSize) { + // each writer must have its own PartitionKey object + PartitionKey copy = partitionKey.copy(); // calling invalidate for a non-existent key is a safe operation - writers.invalidate(partitionKey); - recordWriter = createWriter(partitionKey); - writers.put(partitionKey, recordWriter); + writers.invalidate(copy); + recordWriter = createWriter(copy); + writers.put(copy, recordWriter); } return recordWriter; } diff --git a/sdks/java/io/iceberg/src/test/java/org/apache/beam/sdk/io/iceberg/catalog/IcebergCatalogBaseIT.java b/sdks/java/io/iceberg/src/test/java/org/apache/beam/sdk/io/iceberg/catalog/IcebergCatalogBaseIT.java index 3970fd07c5c6..df2ca5adb7ac 100644 --- a/sdks/java/io/iceberg/src/test/java/org/apache/beam/sdk/io/iceberg/catalog/IcebergCatalogBaseIT.java +++ b/sdks/java/io/iceberg/src/test/java/org/apache/beam/sdk/io/iceberg/catalog/IcebergCatalogBaseIT.java @@ -28,10 +28,13 @@ import java.io.Serializable; import java.util.ArrayList; import java.util.Arrays; +import java.util.Collections; import java.util.HashMap; import java.util.List; import java.util.Map; +import java.util.Set; import java.util.UUID; +import java.util.function.BiFunction; import java.util.stream.Collectors; import java.util.stream.LongStream; import java.util.stream.Stream; @@ -65,6 +68,7 @@ import org.apache.iceberg.TableScan; import org.apache.iceberg.catalog.Catalog; import org.apache.iceberg.catalog.TableIdentifier; +import org.apache.iceberg.data.IdentityPartitionConverters; import org.apache.iceberg.data.Record; import org.apache.iceberg.data.parquet.GenericParquetReaders; import org.apache.iceberg.data.parquet.GenericParquetWriter; @@ -74,7 +78,10 @@ import org.apache.iceberg.io.InputFile; import org.apache.iceberg.io.OutputFile; import org.apache.iceberg.parquet.Parquet; +import org.apache.iceberg.types.Type; +import org.apache.iceberg.types.TypeUtil; import org.apache.iceberg.util.DateTimeUtil; +import org.apache.iceberg.util.PartitionUtil; import org.checkerframework.checker.nullness.qual.Nullable; import org.joda.time.DateTime; import org.joda.time.DateTimeZone; @@ -295,6 +302,22 @@ private List populateTable(Table table) throws IOException { return expectedRows; } + private static Map constantsMap( + FileScanTask task, + BiFunction converter, + org.apache.iceberg.Schema schema) { + PartitionSpec spec = task.spec(); + Set idColumns = spec.identitySourceIds(); + org.apache.iceberg.Schema partitionSchema = TypeUtil.select(schema, idColumns); + boolean projectsIdentityPartitionColumns = !partitionSchema.columns().isEmpty(); + + if (projectsIdentityPartitionColumns) { + return PartitionUtil.constantsMap(task, converter); + } else { + return Collections.emptyMap(); + } + } + private List readRecords(Table table) { org.apache.iceberg.Schema tableSchema = table.schema(); 
TableScan tableScan = table.newScan().project(tableSchema); @@ -303,13 +326,16 @@ private List readRecords(Table table) { InputFilesDecryptor descryptor = new InputFilesDecryptor(task, table.io(), table.encryption()); for (FileScanTask fileTask : task.files()) { + Map idToConstants = + constantsMap(fileTask, IdentityPartitionConverters::convertConstant, tableSchema); InputFile inputFile = descryptor.getInputFile(fileTask); CloseableIterable iterable = Parquet.read(inputFile) .split(fileTask.start(), fileTask.length()) .project(tableSchema) .createReaderFunc( - fileSchema -> GenericParquetReaders.buildReader(tableSchema, fileSchema)) + fileSchema -> + GenericParquetReaders.buildReader(tableSchema, fileSchema, idToConstants)) .filter(fileTask.residual()) .build();
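To make the partition-value fix in the final patch concrete: PartitionKey.partition(...) overwrites the key's field values in place, so RecordWriterManager now uses one routingPartitionKey purely to look up the destination partition and hands every cached writer its own partitionKey.copy(). The sketch below illustrates the hazard outside of Beam; it is an illustrative example only (the two-field schema and the "shard" column are made up), not code from the patch.

    import java.util.ArrayList;
    import java.util.HashMap;
    import java.util.List;
    import java.util.Map;
    import org.apache.iceberg.PartitionKey;
    import org.apache.iceberg.PartitionSpec;
    import org.apache.iceberg.Schema;
    import org.apache.iceberg.data.GenericRecord;
    import org.apache.iceberg.data.Record;
    import org.apache.iceberg.types.Types;

    public class PartitionKeyCopySketch {
      public static void main(String[] args) {
        Schema schema =
            new Schema(
                Types.NestedField.required(1, "shard", Types.StringType.get()),
                Types.NestedField.required(2, "value", Types.LongType.get()));
        PartitionSpec spec = PartitionSpec.builderFor(schema).identity("shard").build();

        // One mutable key used for routing, in the same spirit as routingPartitionKey above.
        PartitionKey routingKey = new PartitionKey(spec, schema);
        Map<PartitionKey, List<Record>> buffers = new HashMap<>();

        for (String shard : new String[] {"a", "b"}) {
          Record rec = GenericRecord.create(schema);
          rec.setField("shard", shard);
          rec.setField("value", 1L);
          routingKey.partition(rec); // mutates routingKey to point at this record's partition

          // Buggy variant: keying the map with the shared routingKey instance would let the
          // entry created for "a" silently read as "b" after the next partition(...) call,
          // which is how DataFile metadata could end up with the wrong partition values.
          buffers.computeIfAbsent(routingKey.copy(), k -> new ArrayList<>()).add(rec);
        }

        System.out.println(buffers.keySet()); // two distinct partition keys, as intended
      }
    }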