diff --git a/.github/container/Dockerfile.maxtext b/.github/container/Dockerfile.maxtext index 87b73efcd..6694aa821 100644 --- a/.github/container/Dockerfile.maxtext +++ b/.github/container/Dockerfile.maxtext @@ -2,43 +2,7 @@ ARG BASE_IMAGE=ghcr.io/nvidia/jax-mealkit:jax ARG URLREF_MAXTEXT=https://github.com/google/maxtext.git#main -ARG URLREF_TFTEXT=https://github.com/tensorflow/text.git#master ARG SRC_PATH_MAXTEXT=/opt/maxtext -ARG SRC_PATH_TFTEXT=/opt/tensorflow-text - -############################################################################### -## build tensorflow-text and lingvo, which do not have working arm64 pip wheels -############################################################################### - -ARG BASE_IMAGE -FROM ${BASE_IMAGE} as wheel-builder - -#------------------------------------------------------------------------------ -# build tensorflow-text from source -#------------------------------------------------------------------------------ - -# Remove TFTEXT build from source when it has py-3.12 wheels for x86/arm64 -FROM wheel-builder as tftext-builder -ARG URLREF_TFTEXT -ARG SRC_PATH_TFTEXT - -RUN pip install tensorflow_datasets==4.9.2 auditwheel tensorflow==2.18.0 -RUN git-clone.sh ${URLREF_TFTEXT} ${SRC_PATH_TFTEXT} -RUN <<"EOF" bash -exu -o pipefail -cd ${SRC_PATH_TFTEXT} - -# The tftext build script queries GitHub, but these requests are sometimes -# throttled by GH, resulting in a corrupted uri for tensorflow in WORKSPACE. -# A workaround (needs to be updated when the tensorflow version changes): -sed -i "s/# Update TF dependency to installed tensorflow./commit_slug=6550e4bd80223cdb8be6c3afd1f81e86a4d433c3/" oss_scripts/prepare_tf_dep.sh - -# Newer versions of LLVM make lld's --undefined-version check of lld is strict -# by default (https://reviews.llvm.org/D135402), but the tftext build seems to -# rely on this behavior. 
-echo "write_to_bazelrc \"build --linkopt='-Wl,--undefined-version'\"" >> oss_scripts/configure.sh - -./oss_scripts/run_build.sh -EOF ############################################################################### ## Download source and add auxiliary scripts @@ -46,13 +10,7 @@ EOF FROM ${BASE_IMAGE} as mealkit ARG URLREF_MAXTEXT -ARG URLREF_TFTEXT=https://github.com/tensorflow/text.git#master ARG SRC_PATH_MAXTEXT -ARG SRC_PATH_TFTEXT=/opt/tensorflow-text - -# Preserve version information of tensorflow-text -COPY --from=tftext-builder ${SRC_PATH_TFTEXT}/tensorflow_text*.whl /opt/ -RUN echo "tensorflow-text @ file://$(ls /opt/tensorflow_text*.whl)" >> /opt/pip-tools.d/requirements-maxtext.in RUN <<"EOF" bash -ex git-clone.sh ${URLREF_MAXTEXT} ${SRC_PATH_MAXTEXT} @@ -85,6 +43,3 @@ FROM mealkit as final RUN pip-finalize.sh WORKDIR ${SRC_PATH_MAXTEXT} - -# When tftext and lingvo wheels are published on pypi.org, revert this -# Dockerfile to 5c4b687b918e6569bca43758c346ad8e67460154 diff --git a/.github/container/Dockerfile.pax b/.github/container/Dockerfile.pax index 938bd853c..5a8c23ef7 100644 --- a/.github/container/Dockerfile.pax +++ b/.github/container/Dockerfile.pax @@ -3,11 +3,9 @@ ARG BASE_IMAGE=ghcr.io/nvidia/jax-mealkit:jax ARG URLREF_PAXML=https://github.com/google/paxml.git#main ARG URLREF_PRAXIS=https://github.com/google/praxis.git#main -ARG URLREF_TFTEXT=https://github.com/tensorflow/text.git#master ARG URLREF_LINGVO=https://github.com/tensorflow/lingvo.git#master ARG SRC_PATH_PAXML=/opt/paxml ARG SRC_PATH_PRAXIS=/opt/praxis -ARG SRC_PATH_TFTEXT=/opt/tensorflow-text ARG SRC_PATH_LINGVO=/opt/lingvo ############################################################################### @@ -17,32 +15,6 @@ ARG SRC_PATH_LINGVO=/opt/lingvo ARG BASE_IMAGE FROM ${BASE_IMAGE} as wheel-builder -#------------------------------------------------------------------------------ -# build tensorflow-text from source 
-#------------------------------------------------------------------------------ - -# Remove TFTEXT build from source when it has py-3.12 wheels for x86/arm64 -FROM wheel-builder as tftext-builder -ARG URLREF_TFTEXT -ARG SRC_PATH_TFTEXT -RUN <<"EOF" bash -exu -o pipefail -pip install tensorflow_datasets==4.9.2 auditwheel tensorflow==2.18.0 -git-clone.sh ${URLREF_TFTEXT} ${SRC_PATH_TFTEXT} -cd ${SRC_PATH_TFTEXT} - -# The tftext build script queries GitHub, but these requests are sometimes -# throttled by GH, resulting in a corrupted uri for tensorflow in WORKSPACE. -# A workaround (needs to be updated when the tensorflow version changes): -sed -i "s/# Update TF dependency to installed tensorflow./commit_slug=6550e4bd80223cdb8be6c3afd1f81e86a4d433c3/" oss_scripts/prepare_tf_dep.sh - -# Newer versions of LLVM make lld's --undefined-version check of lld is strict -# by default (https://reviews.llvm.org/D135402), but the tftext build seems to -# rely on this behavior. -echo "write_to_bazelrc \"build --linkopt='-Wl,--undefined-version'\"" >> oss_scripts/configure.sh - -./oss_scripts/run_build.sh -EOF - #------------------------------------------------------------------------------ # build lingvo #------------------------------------------------------------------------------ @@ -50,13 +22,8 @@ EOF # Remove Lingvo build from source when it has py-3.12 wheels for x86/arm64 FROM wheel-builder as lingvo-builder ARG URLREF_LINGVO -ARG SRC_PATH_TFTEXT ARG SRC_PATH_LINGVO -# Preserve the version of tensorflow-text -COPY --from=tftext-builder /opt/manifest.d/git-clone.yaml /opt/manifest.d/git-clone.yaml -COPY --from=tftext-builder ${SRC_PATH_TFTEXT}/tensorflow_text*.whl /opt/ - ENV USE_BAZEL_VERSION=7.1.2 # build lingvo @@ -89,10 +56,9 @@ EOFINNER fi -pip install tensorflow_datasets==4.9.2 auditwheel tensorflow==2.18.0 /opt/tensorflow_text*.whl +pip install tensorflow_datasets==4.9.2 auditwheel tensorflow==2.18.0 for pattern in \ "s|tensorflow=|#tensorflow=|g" \ - 
"s|tensorflow-text=|#tensorflow-text=|g" \ "s|dataclasses=|#dataclasses=|g" \ "s|==.*||g" \ ; do @@ -101,7 +67,7 @@ done # Lingvo support only python < 3.12, so we hack it and update dependencies # to be able to build for py-3.12 for pattern in \ - "s|tensorflow-text~=2.13.0|tensorflow-text~=2.18.0|g" \ + "s|tensorflow-text~=2.13.0|tensorflow-text~=2.18.1|g" \ "s|tensorflow~=2.13.0|tensorflow~=2.18.0|g" \ "s|python_requires='>=3.8,<3.11'|python_requires='>=3.8,<3.13'|" \ ; do @@ -128,16 +94,12 @@ ARG URLREF_PAXML ARG URLREF_PRAXIS ARG SRC_PATH_PAXML ARG SRC_PATH_PRAXIS -ARG SRC_PATH_TFTEXT # Preserve version information of tensorflow-text and lingvo COPY --from=lingvo-builder /opt/manifest.d/git-clone.yaml /opt/manifest.d/git-clone.yaml COPY --from=lingvo-builder /tmp/lingvo/dist/lingvo*-linux*.whl /opt/ RUN echo "lingvo @ file://$(ls /opt/lingvo*.whl)" >> /opt/pip-tools.d/requirements-paxml.in -COPY --from=tftext-builder ${SRC_PATH_TFTEXT}/tensorflow_text*.whl /opt/ -RUN echo "tensorflow-text @ file://$(ls /opt/tensorflow_text*.whl)" >> /opt/pip-tools.d/requirements-paxml.in - # paxml + praxis RUN <<"EOF" bash -ex echo "tensorflow_datasets==4.9.2" >> /opt/pip-tools.d/requirements-paxml.in @@ -187,5 +149,5 @@ FROM mealkit as final RUN pip-finalize.sh -# When tftext and lingvo wheels are published on pypi.org, revert this +# When lingvo wheels are published on pypi.org, revert this # Dockerfile to 5c4b687b918e6569bca43758c346ad8e67460154 diff --git a/.github/container/Dockerfile.t5x b/.github/container/Dockerfile.t5x index ea4bbf2ec..1568ff559 100644 --- a/.github/container/Dockerfile.t5x +++ b/.github/container/Dockerfile.t5x @@ -3,64 +3,18 @@ # docker buildx build -f Dockerfile.t5x --tag t5x --build-arg BASE_IMAGE=ghcr.io/nvidia/jax:mealkit-2024-01-22 . 
ARG BASE_IMAGE=ghcr.io/nvidia/jax-mealkit:jax -ARG URLREF_TFTEXT=https://github.com/tensorflow/text.git#master ARG URLREF_T5X=https://github.com/google-research/t5x.git#main ARG URLREF_AIRIO=https://github.com/google/airio.git#main -ARG SRC_PATH_TFTEXT=/opt/tensorflow-text ARG SRC_PATH_T5X=/opt/t5x ARG SRC_PATH_AIRIO=/opt/airio - -############################################################################### -## build several packages which do not have working arm64 pip wheels -############################################################################### - -ARG BASE_IMAGE -FROM ${BASE_IMAGE} as wheel-builder - -#------------------------------------------------------------------------------ -# build tensorflow-text from source -#------------------------------------------------------------------------------ -FROM wheel-builder as tftext-builder -ARG URLREF_TFTEXT -ARG SRC_PATH_TFTEXT - -RUN pip install tensorflow_datasets==4.9.2 auditwheel tensorflow==2.18.0 -RUN <<"EOF" bash -exu -o pipefail -git-clone.sh ${URLREF_TFTEXT} ${SRC_PATH_TFTEXT} -cd ${SRC_PATH_TFTEXT} - -# The tftext build script queries GitHub, but these requests are sometimes -# throttled by GH, resulting in a corrupted uri for tensorflow in WORKSPACE. -# A workaround (needs to be updated when the tensorflow version changes): -sed -i "s/# Update TF dependency to installed tensorflow./commit_slug=6550e4bd80223cdb8be6c3afd1f81e86a4d433c3/" oss_scripts/prepare_tf_dep.sh - -# Newer versions of LLVM make lld's --undefined-version check of lld is strict -# by default (https://reviews.llvm.org/D135402), but the tftext build seems to -# rely on this behavior. 
-echo "write_to_bazelrc \"build --linkopt='-Wl,--undefined-version'\"" >> oss_scripts/configure.sh - -./oss_scripts/run_build.sh -EOF - - -############################################################################### -## T5X -############################################################################### - ARG BASE_IMAGE FROM ${BASE_IMAGE} AS mealkit ARG URLREF_T5X ARG URLREF_AIRIO -ARG SRC_PATH_TFTEXT ARG SRC_PATH_T5X ARG SRC_PATH_AIRIO -# Preserve version information of tensorflow-text -COPY --from=tftext-builder /opt/manifest.d/git-clone.yaml /opt/manifest.d/git-clone.yaml -COPY --from=tftext-builder ${SRC_PATH_TFTEXT}/tensorflow_text*.whl /opt/ -RUN echo "tensorflow-text @ file://$(ls /opt/tensorflow_text*.whl)" >> /opt/pip-tools.d/requirements-t5x.in - RUN <<"EOF" bash -ex # 1. Fetch T5X git-clone.sh "${URLREF_T5X}" "${SRC_PATH_T5X}" diff --git a/.github/container/manifest.yaml b/.github/container/manifest.yaml index b9c06e2e6..433871c14 100644 --- a/.github/container/manifest.yaml +++ b/.github/container/manifest.yaml @@ -59,12 +59,6 @@ lingvo: tracking_ref: master latest_verified_commit: 05a076b0783a8bbf4a770095966c472bb37bbf65 mode: git-clone -tensorflow-text: - # Used only in ARM pax and t5x builds - url: https://github.com/tensorflow/text.git - tracking_ref: master - latest_verified_commit: 1779b3ae16f7bd287c4edcf66d62208dc63256f3 - mode: git-clone pydantic: version: X.Y.Z mode: pip-constraint diff --git a/.github/workflows/_ci.yaml b/.github/workflows/_ci.yaml index b4f3b8143..c2b4cb4ee 100644 --- a/.github/workflows/_ci.yaml +++ b/.github/workflows/_ci.yaml @@ -111,7 +111,6 @@ jobs: DOCKERFILE: .github/container/Dockerfile.maxtext EXTRA_BUILD_ARGS: | URLREF_MAXTEXT=${{ fromJson(inputs.SOURCE_URLREFS).MAXTEXT }} - URLREF_TFTEXT=${{ fromJson(inputs.SOURCE_URLREFS).TENSORFLOW_TEXT }} secrets: inherit build-levanter: @@ -143,7 +142,6 @@ jobs: DOCKERFILE: .github/container/Dockerfile.t5x EXTRA_BUILD_ARGS: | URLREF_T5X=${{ 
fromJson(inputs.SOURCE_URLREFS).T5X }} - URLREF_TFTEXT=${{ fromJson(inputs.SOURCE_URLREFS).TENSORFLOW_TEXT }} URLREF_AIRIO=${{ fromJson(inputs.SOURCE_URLREFS).AIRIO }} secrets: inherit @@ -161,7 +159,6 @@ jobs: EXTRA_BUILD_ARGS: | URLREF_PAXML=${{ fromJson(inputs.SOURCE_URLREFS).PAXML }} URLREF_PRAXIS=${{ fromJson(inputs.SOURCE_URLREFS).PRAXIS }} - URLREF_TFTEXT=${{ fromJson(inputs.SOURCE_URLREFS).TENSORFLOW_TEXT }} URLREF_LINGVO=${{ fromJson(inputs.SOURCE_URLREFS).LINGVO }} secrets: inherit diff --git a/rosetta/Dockerfile.gemma b/rosetta/Dockerfile.gemma index e7db16dcc..4a0ba2965 100644 --- a/rosetta/Dockerfile.gemma +++ b/rosetta/Dockerfile.gemma @@ -11,40 +11,7 @@ ARG URLREF_FLAXFORMER=https://github.com/google/flaxformer.git#main ARG SRC_PATH_FLAXFORMER=/opt/flaxformer ARG URLREF_PANOPTICAPI=https://github.com/akolesnikoff/panopticapi.git#mute ARG SRC_PATH_PANOPTICAPI=/opt/panopticapi -ARG URLREF_TFTEXT=https://github.com/tensorflow/text.git#master -ARG SRC_PATH_TFTEXT=/opt/tensorflow-text -############################################################################### -## Build several packages which do not have working amd64/arm64 pip wheels -############################################################################### - -ARG BASE_IMAGE -FROM ${BASE_IMAGE} as wheel-builder - -#------------------------------------------------------------------------------ -# build tensorflow-text from source -#------------------------------------------------------------------------------ -FROM wheel-builder as tftext-builder -ARG URLREF_TFTEXT -ARG SRC_PATH_TFTEXT - -RUN <<"EOF" bash -exu -o pipefail -pip install tensorflow_datasets==4.9.2 auditwheel tensorflow==2.18.0 -git-clone.sh ${URLREF_TFTEXT} ${SRC_PATH_TFTEXT} -cd ${SRC_PATH_TFTEXT} - -# The tftext build script queries GitHub, but these requests are sometimes -# throttled by GH, resulting in a corrupted uri for tensorflow in WORKSPACE. 
-# A workaround (needs to be updated when the tensorflow version changes): -sed -i "s/# Update TF dependency to installed tensorflow./commit_slug=6550e4bd80223cdb8be6c3afd1f81e86a4d433c3/" oss_scripts/prepare_tf_dep.sh - -# Newer versions of LLVM make lld's --undefined-version check of lld is strict -# by default (https://reviews.llvm.org/D135402), but the tftext build seems to -# rely on this behavior. -echo "write_to_bazelrc \"build --linkopt='-Wl,--undefined-version'\"" >> oss_scripts/configure.sh - -./oss_scripts/run_build.sh -EOF ############################################################################### ## Download source and add auxiliary scripts @@ -62,11 +29,6 @@ ARG URLREF_FLAXFORMER ARG SRC_PATH_FLAXFORMER ARG URLREF_PANOPTICAPI ARG SRC_PATH_PANOPTICAPI -ARG URLREF_TFTEXT -ARG SRC_PATH_TFTEXT - -COPY --from=tftext-builder /opt/manifest.d/git-clone.yaml /opt/manifest.d/git-clone.yaml -COPY --from=tftext-builder ${SRC_PATH_TFTEXT}/tensorflow_text*.whl /opt/ RUN <<"EOF" bash -ex git-clone.sh ${URLREF_GEMMA} ${SRC_PATH_GEMMA} @@ -93,7 +55,7 @@ optax protobuf tfds-nightly tensorflow -tensorflow-text @ file://$(ls /opt/tensorflow_text*.whl) +tensorflow-text tensorflow-gan " >> /opt/pip-tools.d/requirements-gemma.in EOF diff --git a/rosetta/rosetta/projects/maxtext/README.md b/rosetta/rosetta/projects/maxtext/README.md index 2320a7ed9..44baa19ef 100644 --- a/rosetta/rosetta/projects/maxtext/README.md +++ b/rosetta/rosetta/projects/maxtext/README.md @@ -93,7 +93,7 @@ We have run some intial performance and functionality tests with [LLaMA2-7B](htt Please refer to the [example run script](scripts/example_slurm.sub) for more details. We will continue to add more models and associated performance metrics. # Notes -1. The only changes we need to support multiprocessing is to pin tensorflow and tensorflow-text to 2.13.0 version. +1. The only change we need to support multiprocessing is to pin tensorflow and tensorflow-text to version 2.18.0 or higher. 2. 
In order to remove extra copies introduced by DUS (dynamic update slice) when used in conjunction with custom NVIDIA kernels (like cuBLAS for GEMMs), the `--xla_gpu_enable_custom_fusions` and `--xla_gpu_enable_address_computation_fusion` flags were introduced. However, the current XLA has some limitation and sometimes using these flags lead to error. So, in this release, it is advised to turn off these two flags: - --xla_gpu_enable_custom_fusions=false - --xla_gpu_enable_address_computation_fusion=false