From 63beb2f23fbc453728206db3d53d159df2b806b7 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Mika=C3=ABl=20Ducharme?= Date: Tue, 17 Oct 2023 12:36:51 -0400 Subject: [PATCH] feat: Move utils, operators and glam_subdags out of dags directory (#1807) --- .circleci/config.yml | 60 ++ README.md | 68 +- dags/adm_export.py | 1 + dags/bhr_collection.py | 1 + dags/burnham.py | 25 +- dags/clean_gke_pods.py | 1 + dags/crash_symbolication.py | 1 + dags/data_monitoring.py | 5 +- dags/dim_active_users_aggregates.py | 1 + dags/experiment_auto_sizing.py | 2 +- dags/firefox_public_data_report.py | 1 + dags/fivetran_acoustic.py | 10 +- dags/glam.py | 9 +- dags/glam_dev.py | 9 +- dags/glam_fenix.py | 48 +- dags/glam_fog.py | 7 +- dags/glam_glean_imports.py | 1 + dags/jetstream.py | 2 +- dags/merino_jobs.py | 5 +- dags/operational_monitoring.py | 14 +- dags/overwatch.py | 1 + dags/partybal.py | 1 + dags/probe_scraper.py | 1 + dags/search_term_data_validation_v2.py | 1 + dags/socorro_import.py | 27 +- dags/taar_daily.py | 1 + dags/taar_weekly.py | 59 +- dags/utils/constants.py | 11 - docker-compose.yml | 3 + {dags/glam_subdags => operators}/__init__.py | 0 {dags/operators => operators}/bq_sensor.py | 0 .../gcp_container_operator.py | 93 +-- plugins/backfill/main.py | 105 ++- pyproject.toml | 2 +- tests/utils/test_backfill.py | 2 +- tests/utils/test_tags.py | 17 +- {dags/operators => utils}/__init__.py | 0 {dags/utils => utils/acoustic}/__init__.py | 0 .../acoustic/acoustic_client.py | 47 +- {dags/utils => utils}/backfill.py | 0 {dags/utils => utils}/callbacks.py | 17 +- utils/constants.py | 11 + {dags/utils => utils}/dataproc.py | 631 +++++++++--------- {dags/utils => utils}/gcp.py | 4 +- {dags/utils => utils}/gke.py | 4 +- .../glam_subdags}/__init__.py | 0 {dags => utils}/glam_subdags/extract.py | 1 + {dags => utils}/glam_subdags/general.py | 1 + .../glam_subdags/generate_query.py | 0 {dags => utils}/glam_subdags/histograms.py | 1 + {dags => utils}/glam_subdags/probe_hotlist.py | 6 +- {dags/utils => utils}/patched/__init__.py | 0 .../utils => utils}/patched/dataproc_hook.py | 390 ++++++----- {dags/utils => utils}/slack.py | 11 +- {dags/utils => utils}/tags.py | 5 + 55 files changed, 987 insertions(+), 737 deletions(-) delete mode 100644 dags/utils/constants.py rename {dags/glam_subdags => operators}/__init__.py (100%) rename {dags/operators => operators}/bq_sensor.py (100%) rename {dags/operators => operators}/gcp_container_operator.py (59%) rename {dags/operators => utils}/__init__.py (100%) rename {dags/utils => utils/acoustic}/__init__.py (100%) rename {dags/utils => utils}/acoustic/acoustic_client.py (83%) rename {dags/utils => utils}/backfill.py (100%) rename {dags/utils => utils}/callbacks.py (65%) create mode 100644 utils/constants.py rename {dags/utils => utils}/dataproc.py (54%) rename {dags/utils => utils}/gcp.py (99%) rename {dags/utils => utils}/gke.py (97%) rename {dags/utils/acoustic => utils/glam_subdags}/__init__.py (100%) rename {dags => utils}/glam_subdags/extract.py (99%) rename {dags => utils}/glam_subdags/general.py (99%) rename {dags => utils}/glam_subdags/generate_query.py (100%) rename {dags => utils}/glam_subdags/histograms.py (99%) rename {dags => utils}/glam_subdags/probe_hotlist.py (94%) rename {dags/utils => utils}/patched/__init__.py (100%) rename {dags/utils => utils}/patched/dataproc_hook.py (79%) rename {dags/utils => utils}/slack.py (72%) rename {dags/utils => utils}/tags.py (85%) diff --git a/.circleci/config.yml b/.circleci/config.yml index b9171ce69..7358057ab 100644 --- 
a/.circleci/config.yml +++ b/.circleci/config.yml @@ -180,6 +180,51 @@ jobs: gsutil rsync -d -r dataproc_bootstrap gs://moz-fx-data-prod-airflow-dataproc-artifacts/bootstrap gsutil rsync -d -r jobs gs://moz-fx-data-prod-airflow-dataproc-artifacts/jobs + sync-dags-repo: + executor: docker/machine + steps: + - checkout + - add_ssh_keys: + fingerprints: + - "e4:30:50:41:53:f0:d6:3a:bb:c9:38:54:2d:ca:56:41" + - run: + name: πŸ€– Update DAGs repository + command: | + ssh-keyscan -t rsa github.com >> ~/.ssh/known_hosts + git config --global user.email "dataops@mozilla.com" + git config --global user.name "CircleCI Job" + git clone --branch main git@github.com:mozilla/telemetry-airflow-dags.git + cd ~/project/telemetry-airflow-dags + git submodule update --init --recursive --depth 1 telemetry-airflow + cd ${CIRCLE_PROJECT_REPONAME} + git pull origin dags + cd .. + git add ${CIRCLE_PROJECT_REPONAME} + git commit --allow-empty -m "Automatic commit from ${CIRCLE_PROJECT_REPONAME} commit ${CIRCLE_SHA1:0:9} build ${CIRCLE_BUILD_NUM} [skip ci]" + git push origin main + + sync-dags-branch: + executor: docker/machine + steps: + - checkout + - add_ssh_keys: + fingerprints: + - "66:9a:9f:55:2b:5c:f5:d6:ea:c5:c4:f9:7b:db:8d:c0" + - run: + name: πŸ€– Update DAGs branch + command: | + ssh-keyscan -t rsa github.com >> ~/.ssh/known_hosts + git config --global user.name "CircleCI job" + git config --global user.email "dataops@mozilla.com" + git clone --branch dags git@github.com:mozilla/telemetry-airflow.git \ + dags-branch + cd dags-branch/ + rm -rf dags/ + cp -r ~/project/dags dags + git add . + git commit -m "Automatic commit - DAGs from commit ${CIRCLE_SHA1:0:9} build ${CIRCLE_BUILD_NUM} [skip ci]" \ + && git push \ + || echo "Skipping push since it looks like there were no changes" workflows: ci: @@ -297,3 +342,18 @@ workflows: only: /.*/ branches: only: main + + - sync-dags-branch: + name: πŸ”ƒ Update DAGs branch + filters: + branches: + only: main + + - sync-dags-repo: + name: πŸ”ƒ Synchronize telemetry-airflow-dags repository + filters: + branches: + only: main + requires: + - πŸ”ƒ Update DAGs branch + diff --git a/README.md b/README.md index 19a67ca96..eaabb86b2 100644 --- a/README.md +++ b/README.md @@ -25,21 +25,65 @@ Some links relevant to users and developers of WTMO: ## Writing DAGs See the Airflow's [Best Practices guide](https://airflow.apache.org/docs/apache-airflow/stable/best-practices.html#best-practices) to help you write DAGs. -**⚠ Note: How to import DAGs and modules ⚠** +### **⚠ Warning: Do not import resources from the `dags` directory in DAGs definition files ⚠** +As an example, if you have `dags/dag_a.py` and `dags/dag_b.py` and want to use a helper +function in both DAG definition files, define the helper function in the `utils` directory +such as: + +#### `utils/helper.py` +```python +def helper_function(): + return "Help" +``` + +#### `dags/dag_a.py` +```python +from airflow import DAG + +from utils.helper import helper_function + +with DAG("dag_a", ...): + ... 
+``` -**Modules should be imported from the project directory, such as `from dags.my_dag import load_data` -rather than `from my_dag import load_data`.** +#### `dags/dag_b.py` +```python +from airflow import DAG -In Airflow, the `dags`, `config`, and `plugins` folders are [automatically added](https://airflow.apache.org/docs/apache-airflow/stable/administration-and-deployment/modules_management.html#built-in-pythonpath-entries-in-airflow) to the -`PYTHONPATH` to ensure they can be imported and accessed by Airflow's execution environment. +from utils.helper import helper_function + +with DAG("dag_b", ...): + ... +``` + +WTMO deployments use git-sync sidecars to synchronize DAG files from multiple +repositories via [telemetry-airflow-dags](https://github.com/mozilla/telemetry-airflow-dags/) +using git submodules. Git-sync sidecar pattern results in the following directory structure +once deployed. +``` +airflow +β”œβ”€ dags +β”‚ └── repo +β”‚ └── telemetry-airflow-dags +β”‚ β”œβ”€β”€ +β”‚ β”‚ └── dags +β”‚ β”‚ └── +β”‚ β”œβ”€β”€ +β”‚ β”‚ └── dags +β”‚ β”‚ └── +β”‚ └── +β”‚ └── dags +β”‚ └── +β”œβ”€ utils +β”‚ └── ... +└─ plugins + └── ... +``` +Hence, defining `helper_function()` in `dags/dag_a.py` and +importing the function in `dags/dag_b.py` as `from dags.dag_a import helper_function` +**will not work after deployment** because of the directory structured required for +git-sync sidecars. -However, this default configuration can cause problems when running unit tests located -in the `tests` directory. Since the `PYTHONPATH` includes the `dags` directory, -but not the project directory itself, the unit tests will not be able to import code from -the `dags` directory. This limitation restricts the ability to test the DAGs effectively -within the project structure. It is also generally expected that imports should work from the -project directory rather than from any of its subdirectories. For this reason, `telemetry-airflow`'s -Dockerfile adds the project directory to `PYTHONPATH`. 
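(Editorial illustration, not part of the diff above: with helpers living in `utils/` as the README guidance describes, a unit test under `tests/` can import them directly. This is a minimal sketch only; `utils/helper.py` and `helper_function()` are the hypothetical example files from the guidance above, and it assumes the project root is on `PYTHONPATH`.)

```python
# tests/utils/test_helper.py -- minimal sketch; assumes the project root is on PYTHONPATH
# and that utils/helper.py defines helper_function() as in the hypothetical README example.
from utils.helper import helper_function


def test_helper_function():
    # The example definition above returns the string "Help".
    assert helper_function() == "Help"
```

This mirrors how the repository's own tests import shared code after the move (e.g. `from utils.backfill import BackfillParams` in `tests/utils/test_backfill.py` later in this patch), rather than importing through the `dags` package.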
## Prerequisites diff --git a/dags/adm_export.py b/dags/adm_export.py index 5affd295f..dd8fd1b14 100644 --- a/dags/adm_export.py +++ b/dags/adm_export.py @@ -3,6 +3,7 @@ from airflow import DAG from airflow.hooks.base import BaseHook from airflow.sensors.external_task import ExternalTaskSensor + from operators.gcp_container_operator import GKEPodOperator from utils.constants import ALLOWED_STATES, FAILED_STATES from utils.tags import Tag diff --git a/dags/bhr_collection.py b/dags/bhr_collection.py index 1a1261e39..8d7f3f971 100644 --- a/dags/bhr_collection.py +++ b/dags/bhr_collection.py @@ -21,6 +21,7 @@ from airflow.operators.subdag import SubDagOperator from airflow.providers.amazon.aws.hooks.base_aws import AwsBaseHook from airflow.sensors.external_task import ExternalTaskSensor + from operators.gcp_container_operator import GKEPodOperator from utils.constants import ALLOWED_STATES, FAILED_STATES from utils.dataproc import get_dataproc_parameters, moz_dataproc_pyspark_runner diff --git a/dags/burnham.py b/dags/burnham.py index f532a8755..b3c9dcec8 100644 --- a/dags/burnham.py +++ b/dags/burnham.py @@ -6,13 +6,13 @@ import datetime import json import logging -import uuid import time - +import uuid from dataclasses import dataclass from airflow import DAG from airflow.operators.python import PythonOperator + from operators.bq_sensor import BigQuerySQLSensorOperator from operators.gcp_container_operator import GKEPodOperator from utils.tags import Tag @@ -431,7 +431,8 @@ def burnham_run( gke_namespace=DEFAULT_GKE_NAMESPACE, **kwargs, ): - """Create a new GKEPodOperator that runs the burnham Docker image. + """ + Create a new GKEPodOperator that runs the burnham Docker image. :param str task_id: [Required] ID for the task :param str burnham_test_run: [Required] UUID for the test run @@ -482,7 +483,8 @@ def burnham_run( def burnham_sensor(task_id, sql, gcp_conn_id=DEFAULT_GCP_CONN_ID, **kwargs): - """Create a new BigQuerySQLSensorOperator that checks for burnham data. + """ + Create a new BigQuerySQLSensorOperator that checks for burnham data. :param str task_id: [Required] ID for the task :param str sql: [Required] SQL for the sensor @@ -512,7 +514,8 @@ def burnham_bigquery_run( gke_namespace=DEFAULT_GKE_NAMESPACE, **kwargs, ): - """Create a new GKEPodOperator that runs the burnham-bigquery Docker image. + """ + Create a new GKEPodOperator that runs the burnham-bigquery Docker image. :param str task_id: [Required] ID for the task :param str project_id: [Required] Project ID where target table lives @@ -558,7 +561,8 @@ def burnham_bigquery_run( def encode_test_scenarios(test_scenarios): - """Encode the given test scenarios as a str. + """ + Encode the given test scenarios as a str. :param List[Dict[str, object]] test_scenarios: [Required] ID for the task :return: str @@ -570,7 +574,8 @@ def encode_test_scenarios(test_scenarios): def do_sleep(minutes): - """Sleep for the given number of minutes. + """ + Sleep for the given number of minutes. Writes out an update every minute to give some indication of aliveness. """ @@ -581,7 +586,8 @@ def do_sleep(minutes): def sleep_task(minutes, task_id): - """Return an operator that sleeps for a certain number of minutes. + """ + Return an operator that sleeps for a certain number of minutes. 
:param int minutes: [Required] Number of minutes to sleep :param string task_id: [Required] ID for the task @@ -592,7 +598,7 @@ def sleep_task(minutes, task_id): task_id=task_id, depends_on_past=False, python_callable=do_sleep, - op_kwargs=dict(minutes=minutes), + op_kwargs={"minutes": minutes}, ) @@ -605,7 +611,6 @@ def sleep_task(minutes, task_id): doc_md=DOCS, tags=tags, ) as dag: - # Generate a UUID for this test run generate_burnham_test_run_uuid = PythonOperator( task_id="generate_burnham_test_run_uuid", diff --git a/dags/clean_gke_pods.py b/dags/clean_gke_pods.py index 1076d0b24..8e6078519 100644 --- a/dags/clean_gke_pods.py +++ b/dags/clean_gke_pods.py @@ -1,6 +1,7 @@ from datetime import datetime, timedelta from airflow import DAG + from operators.gcp_container_operator import GKEPodOperator from utils.tags import Tag diff --git a/dags/crash_symbolication.py b/dags/crash_symbolication.py index 3848c7ae7..c1ba6168c 100644 --- a/dags/crash_symbolication.py +++ b/dags/crash_symbolication.py @@ -11,6 +11,7 @@ from airflow.operators.subdag import SubDagOperator from airflow.providers.amazon.aws.hooks.base_aws import AwsBaseHook from airflow.sensors.external_task import ExternalTaskSensor + from utils.constants import ALLOWED_STATES, FAILED_STATES from utils.dataproc import get_dataproc_parameters, moz_dataproc_pyspark_runner from utils.tags import Tag diff --git a/dags/data_monitoring.py b/dags/data_monitoring.py index d1a46e85c..62a6d3573 100644 --- a/dags/data_monitoring.py +++ b/dags/data_monitoring.py @@ -1,9 +1,9 @@ from datetime import datetime from airflow import DAG -from operators.gcp_container_operator import GKEPodOperator -from dags.utils.tags import Tag +from operators.gcp_container_operator import GKEPodOperator +from utils.tags import Tag DOCS = """\ This DAG is related to data monitoring project it is still under development. 
@@ -57,7 +57,6 @@ tags=TAGS, catchup=True, ) as dag: - for target_dataset in TARGET_DATASETS: project_id, dataset, table = target_dataset.split(".") diff --git a/dags/dim_active_users_aggregates.py b/dags/dim_active_users_aggregates.py index 90ab8b554..ff88e8073 100644 --- a/dags/dim_active_users_aggregates.py +++ b/dags/dim_active_users_aggregates.py @@ -15,6 +15,7 @@ from airflow import DAG from airflow.operators.empty import EmptyOperator from airflow.sensors.external_task import ExternalTaskSensor + from operators.gcp_container_operator import GKEPodOperator from utils.constants import ALLOWED_STATES, FAILED_STATES from utils.tags import Tag diff --git a/dags/experiment_auto_sizing.py b/dags/experiment_auto_sizing.py index f1907d3cf..d367589e6 100644 --- a/dags/experiment_auto_sizing.py +++ b/dags/experiment_auto_sizing.py @@ -10,6 +10,7 @@ from airflow import DAG from airflow.sensors.external_task import ExternalTaskSensor + from operators.gcp_container_operator import GKEPodOperator from utils.constants import ALLOWED_STATES, FAILED_STATES from utils.tags import Tag @@ -34,7 +35,6 @@ doc_md=__doc__, tags=tags, ) as dag: - # Built from repo https://github.com/mozilla/auto-sizing auto_sizing_image = "gcr.io/moz-fx-data-experiments/auto_sizing:latest" diff --git a/dags/firefox_public_data_report.py b/dags/firefox_public_data_report.py index 276fadac6..c2ecbac7a 100644 --- a/dags/firefox_public_data_report.py +++ b/dags/firefox_public_data_report.py @@ -10,6 +10,7 @@ from airflow import DAG from airflow.operators.subdag import SubDagOperator from airflow.sensors.external_task import ExternalTaskSensor + from operators.gcp_container_operator import GKEPodOperator from utils.constants import ALLOWED_STATES, FAILED_STATES from utils.dataproc import ( diff --git a/dags/fivetran_acoustic.py b/dags/fivetran_acoustic.py index c05f9617f..bda7bfd83 100644 --- a/dags/fivetran_acoustic.py +++ b/dags/fivetran_acoustic.py @@ -1,5 +1,5 @@ from datetime import datetime, timedelta -from typing import Any, Dict +from typing import Any from airflow import DAG from airflow.hooks.base import BaseHook @@ -8,13 +8,13 @@ from fivetran_provider.operators.fivetran import FivetranOperator from fivetran_provider.sensors.fivetran import FivetranSensor -from dags.utils.acoustic.acoustic_client import AcousticClient -from dags.utils.callbacks import retry_tasks_callback -from dags.utils.tags import Tag +from utils.acoustic.acoustic_client import AcousticClient +from utils.callbacks import retry_tasks_callback +from utils.tags import Tag def _generate_acoustic_report( - conn_id: str, report_type: str, config: Dict[Any, Any], *args, **kwargs + conn_id: str, report_type: str, config: dict[Any, Any], *args, **kwargs ): """Retrieve Acoustic connection details from Airflow, instantiate AcousticClient and generate report.""" diff --git a/dags/glam.py b/dags/glam.py index a37403ab5..3d202dd7c 100644 --- a/dags/glam.py +++ b/dags/glam.py @@ -14,12 +14,13 @@ from airflow.operators.subdag import SubDagOperator from airflow.sensors.external_task import ExternalTaskMarker, ExternalTaskSensor from airflow.utils.task_group import TaskGroup -from glam_subdags.extract import extract_user_counts, extracts_subdag -from glam_subdags.general import repeated_subdag -from glam_subdags.generate_query import generate_and_run_desktop_query -from glam_subdags.histograms import histogram_aggregates_subdag + from utils.constants import ALLOWED_STATES, FAILED_STATES from utils.gcp import bigquery_etl_query, gke_command +from 
utils.glam_subdags.extract import extract_user_counts, extracts_subdag +from utils.glam_subdags.general import repeated_subdag +from utils.glam_subdags.generate_query import generate_and_run_desktop_query +from utils.glam_subdags.histograms import histogram_aggregates_subdag from utils.tags import Tag project_id = "moz-fx-data-shared-prod" diff --git a/dags/glam_dev.py b/dags/glam_dev.py index 1f9c53efb..af4232810 100644 --- a/dags/glam_dev.py +++ b/dags/glam_dev.py @@ -14,11 +14,12 @@ from airflow import DAG from airflow.operators.subdag import SubDagOperator -from glam_subdags.general import repeated_subdag -from glam_subdags.generate_query import generate_and_run_desktop_query -from glam_subdags.histograms import histogram_aggregates_subdag -from glam_subdags.probe_hotlist import update_hotlist + from utils.gcp import bigquery_etl_query, gke_command +from utils.glam_subdags.general import repeated_subdag +from utils.glam_subdags.generate_query import generate_and_run_desktop_query +from utils.glam_subdags.histograms import histogram_aggregates_subdag +from utils.glam_subdags.probe_hotlist import update_hotlist from utils.tags import Tag prod_project_id = "moz-fx-data-shared-prod" diff --git a/dags/glam_fenix.py b/dags/glam_fenix.py index 74a736be6..085f6a6fc 100644 --- a/dags/glam_fenix.py +++ b/dags/glam_fenix.py @@ -1,5 +1,6 @@ """ -Firefox for Android ETL for https://glam.telemetry.mozilla.org/ +Firefox for Android ETL for https://glam.telemetry.mozilla.org/. + Generates and runs a series of BQ queries, see [bigquery_etl/glam](https://github.com/mozilla/bigquery-etl/tree/main/bigquery_etl/glam) in bigquery-etl and the @@ -8,20 +9,19 @@ """ from datetime import datetime, timedelta +from functools import partial from airflow import DAG from airflow.operators.empty import EmptyOperator -from airflow.sensors.external_task import ExternalTaskMarker -from airflow.sensors.external_task import ExternalTaskSensor +from airflow.sensors.external_task import ExternalTaskMarker, ExternalTaskSensor from airflow.utils.task_group import TaskGroup -from glam_subdags.generate_query import ( +from utils.constants import ALLOWED_STATES, FAILED_STATES +from utils.gcp import gke_command +from utils.glam_subdags.generate_query import ( generate_and_run_glean_queries, generate_and_run_glean_task, ) -from utils.constants import ALLOWED_STATES, FAILED_STATES -from utils.gcp import gke_command -from functools import partial from utils.tags import Tag default_args = { @@ -93,10 +93,10 @@ ) pre_import = EmptyOperator( - task_id='pre_import', + task_id="pre_import", ) - with TaskGroup('glam_fenix_external') as glam_fenix_external: + with TaskGroup("glam_fenix_external") as glam_fenix_external: ExternalTaskMarker( task_id="glam_glean_imports__wait_for_fenix", external_dag_id="glam_glean_imports", @@ -112,8 +112,7 @@ task_id=f"daily_{product}", product=product, destination_project_id=PROJECT, - env_vars=dict(STAGE="daily"), - + env_vars={"STAGE": "daily"}, ) mapping[product] = query wait_for_copy_deduplicate >> query @@ -127,12 +126,12 @@ generate_and_run_glean_task, product=product, destination_project_id=PROJECT, - env_vars=dict(STAGE="incremental"), - + env_vars={"STAGE": "incremental"}, + ) + view, init, query = ( + partial(func, task_type=task_type) + for task_type in ["view", "init", "query"] ) - view, init, query = [ - partial(func, task_type=task_type) for task_type in ["view", "init", "query"] - ] # stage 1 - incremental clients_daily_histogram_aggregates = view( @@ -162,8 +161,12 @@ scalar_probe_counts = 
query(task_name=f"{product}__scalar_probe_counts_v1") scalar_percentile = query(task_name=f"{product}__scalar_percentiles_v1") - histogram_bucket_counts = query(task_name=f"{product}__histogram_bucket_counts_v1") - histogram_probe_counts = query(task_name=f"{product}__histogram_probe_counts_v1") + histogram_bucket_counts = query( + task_name=f"{product}__histogram_bucket_counts_v1" + ) + histogram_probe_counts = query( + task_name=f"{product}__histogram_probe_counts_v1" + ) histogram_percentiles = query(task_name=f"{product}__histogram_percentiles_v1") probe_counts = view(task_name=f"{product}__view_probe_counts_v1") @@ -186,7 +189,6 @@ command=["script/glam/export_csv"], docker_image="gcr.io/moz-fx-data-airflow-prod-88e0/bigquery-etl:latest", gcp_conn_id="google_cloud_airflow_gke", - ) # set all of the dependencies for all of the tasks @@ -228,5 +230,11 @@ >> probe_counts ) probe_counts >> sample_counts >> extract_probe_counts >> export >> pre_import - clients_scalar_aggregate >> user_counts >> extract_user_counts >> export >> pre_import + ( + clients_scalar_aggregate + >> user_counts + >> extract_user_counts + >> export + >> pre_import + ) clients_histogram_aggregate >> export >> pre_import diff --git a/dags/glam_fog.py b/dags/glam_fog.py index 6482f6d17..87d50d5d1 100644 --- a/dags/glam_fog.py +++ b/dags/glam_fog.py @@ -5,12 +5,13 @@ from airflow.operators.empty import EmptyOperator from airflow.sensors.external_task import ExternalTaskMarker, ExternalTaskSensor from airflow.utils.task_group import TaskGroup -from glam_subdags.generate_query import ( + +from utils.constants import ALLOWED_STATES, FAILED_STATES +from utils.gcp import gke_command +from utils.glam_subdags.generate_query import ( generate_and_run_glean_queries, generate_and_run_glean_task, ) -from utils.constants import ALLOWED_STATES, FAILED_STATES -from utils.gcp import gke_command from utils.tags import Tag default_args = { diff --git a/dags/glam_glean_imports.py b/dags/glam_glean_imports.py index d78f6ce49..6bad0424a 100644 --- a/dags/glam_glean_imports.py +++ b/dags/glam_glean_imports.py @@ -6,6 +6,7 @@ from airflow.models import Variable from airflow.sensors.external_task import ExternalTaskSensor from airflow.utils.task_group import TaskGroup + from operators.gcp_container_operator import GKENatPodOperator from utils.constants import ALLOWED_STATES, FAILED_STATES from utils.tags import Tag diff --git a/dags/jetstream.py b/dags/jetstream.py index 90b73de01..3170d6b26 100644 --- a/dags/jetstream.py +++ b/dags/jetstream.py @@ -16,6 +16,7 @@ from airflow import DAG from airflow.sensors.external_task import ExternalTaskSensor + from operators.gcp_container_operator import GKEPodOperator from utils.constants import ALLOWED_STATES, FAILED_STATES from utils.tags import Tag @@ -43,7 +44,6 @@ doc_md=__doc__, tags=tags, ) as dag: - # Built from repo https://github.com/mozilla/jetstream jetstream_image = "gcr.io/moz-fx-data-experiments/jetstream:latest" diff --git a/dags/merino_jobs.py b/dags/merino_jobs.py index b8cdbe485..a55f09964 100644 --- a/dags/merino_jobs.py +++ b/dags/merino_jobs.py @@ -1,9 +1,10 @@ import datetime -from typing import Any, Dict, List, Optional +from typing import Any from airflow import DAG from airflow.hooks.base import BaseHook from airflow.operators.email import EmailOperator + from operators.gcp_container_operator import GKEPodOperator from utils.tags import Tag @@ -16,7 +17,7 @@ def merino_job( - name: str, arguments: List[str], env_vars: Optional[Dict[str, Any]] = None, **kwargs + name: str, 
arguments: list[str], env_vars: dict[str, Any] | None = None, **kwargs ): default_env_vars = {"MERINO_ENV": "production"} if env_vars is None: diff --git a/dags/operational_monitoring.py b/dags/operational_monitoring.py index 7454ba5ee..4e0e857f2 100644 --- a/dags/operational_monitoring.py +++ b/dags/operational_monitoring.py @@ -1,4 +1,4 @@ -from datetime import timedelta, datetime +from datetime import datetime, timedelta from airflow import DAG from airflow.sensors.external_task import ExternalTaskSensor @@ -11,7 +11,7 @@ This DAG schedules queries for populating datasets used for operational monitoring. -The queries are generated via [`opmon`](https://github.com/mozilla/opmon). +The queries are generated via [`opmon`](https://github.com/mozilla/opmon). #### Owner @@ -52,7 +52,7 @@ email=["ascholtz@mozilla.com"], arguments=[ "--log_to_bigquery", - "run", + "run", "--date={{ ds }}", ], dag=dag, @@ -64,8 +64,8 @@ external_task_id="telemetry_derived__clients_daily__v6", execution_delta=timedelta(hours=2), mode="reschedule", - allowed_states=['success'], - failed_states=['failed', 'upstream_failed', 'skipped'], + allowed_states=["success"], + failed_states=["failed", "upstream_failed", "skipped"], pool="DATA_ENG_EXTERNALTASKSENSOR", email_on_retry=False, dag=dag, @@ -77,8 +77,8 @@ external_task_id="search_derived__search_clients_daily__v8", execution_delta=timedelta(hours=1), mode="reschedule", - allowed_states=['success'], - failed_states=['failed', 'upstream_failed', 'skipped'], + allowed_states=["success"], + failed_states=["failed", "upstream_failed", "skipped"], pool="DATA_ENG_EXTERNALTASKSENSOR", email_on_retry=False, dag=dag, diff --git a/dags/overwatch.py b/dags/overwatch.py index 1b196e6e0..c9b179772 100644 --- a/dags/overwatch.py +++ b/dags/overwatch.py @@ -13,6 +13,7 @@ from datetime import datetime from airflow import DAG + from operators.gcp_container_operator import GKEPodOperator default_args = { diff --git a/dags/partybal.py b/dags/partybal.py index 847ed3d96..7d2d540df 100644 --- a/dags/partybal.py +++ b/dags/partybal.py @@ -16,6 +16,7 @@ from datetime import datetime, timedelta from airflow import DAG + from operators.gcp_container_operator import GKEPodOperator from utils.tags import Tag diff --git a/dags/probe_scraper.py b/dags/probe_scraper.py index a921aaee9..9eb62d92c 100644 --- a/dags/probe_scraper.py +++ b/dags/probe_scraper.py @@ -10,6 +10,7 @@ from airflow.providers.amazon.aws.hooks.base_aws import AwsBaseHook from airflow.providers.http.operators.http import SimpleHttpOperator from airflow.utils.weekday import WeekDay + from operators.gcp_container_operator import GKEPodOperator from utils.tags import Tag diff --git a/dags/search_term_data_validation_v2.py b/dags/search_term_data_validation_v2.py index 266e9d20a..1c5d16583 100644 --- a/dags/search_term_data_validation_v2.py +++ b/dags/search_term_data_validation_v2.py @@ -11,6 +11,7 @@ from datetime import datetime, timedelta from airflow import DAG + from utils.gcp import gke_command from utils.tags import Tag diff --git a/dags/socorro_import.py b/dags/socorro_import.py index 30dc2964a..46eee2237 100644 --- a/dags/socorro_import.py +++ b/dags/socorro_import.py @@ -1,14 +1,13 @@ +from datetime import datetime, timedelta + from airflow import DAG from airflow.operators.subdag import SubDagOperator - from airflow.providers.google.cloud.operators.cloud_storage_transfer_service import ( CloudDataTransferServiceS3ToGCSOperator, ) from airflow.sensors.external_task import ExternalTaskMarker from 
airflow.utils.task_group import TaskGroup -from datetime import datetime, timedelta - from operators.gcp_container_operator import GKEPodOperator from utils.dataproc import moz_dataproc_pyspark_runner from utils.tags import Tag @@ -47,8 +46,12 @@ tags = [Tag.ImpactTier.tier_2] -with DAG("socorro_import", default_args=default_args, schedule_interval="@daily", tags=tags,) as dag: - +with DAG( + "socorro_import", + default_args=default_args, + schedule_interval="@daily", + tags=tags, +) as dag: # Unsalted cluster name so subsequent runs fail if the cluster name exists cluster_name = "socorro-import-dataproc-cluster" @@ -101,9 +104,9 @@ "--date", "{{ ds_nodash }}", "--source-gcs-path", - "gs://{}/v1/crash_report".format(gcs_data_bucket), + f"gs://{gcs_data_bucket}/v1/crash_report", "--dest-gcs-path", - "gs://{}/{}".format(gcs_data_bucket, dataset), + f"gs://{gcs_data_bucket}/{dataset}", ], idle_delete_ttl=14400, num_workers=8, @@ -113,7 +116,6 @@ ), ) - bq_gcp_conn_id = "google_cloud_airflow_gke" dest_s3_key = "s3://telemetry-parquet" @@ -121,7 +123,7 @@ # Not using load_to_bigquery since our source data is on GCS. # We do use the parquet2bigquery container to load gcs parquet into bq though. bq_dataset = "telemetry_derived" - bq_table_name = "{}_{}".format(dataset, dataset_version) + bq_table_name = f"{dataset}_{dataset_version}" docker_image = "docker.io/mozilla/parquet2bigquery:20190722" @@ -141,14 +143,15 @@ # We remove the current date partition for idempotency. table_name = "{}:{}.{}${{{{ds_nodash}}}}".format( - "{{ var.value.gcp_shared_prod_project }}", bq_dataset, bq_table_name) + "{{ var.value.gcp_shared_prod_project }}", bq_dataset, bq_table_name + ) remove_bq_table_partition = GKEPodOperator( task_id="remove_socorro_crash_bq_table_partition", gcp_conn_id=bq_gcp_conn_id, name="remove_socorro_crash_bq_table_partition", image="gcr.io/moz-fx-data-airflow-prod-88e0/bigquery-etl:latest", - arguments=["bq", "rm", "-f", "--table", table_name] + arguments=["bq", "rm", "-f", "--table", table_name], ) bq_load = GKEPodOperator( @@ -160,7 +163,7 @@ env_vars={"GOOGLE_CLOUD_PROJECT": "{{ var.value.gcp_shared_prod_project }}"}, ) - with TaskGroup('socorro_external') as socorro_external: + with TaskGroup("socorro_external") as socorro_external: ExternalTaskMarker( task_id="crash_symbolication__wait_for_socorro_import", external_dag_id="crash_symbolication", diff --git a/dags/taar_daily.py b/dags/taar_daily.py index 8f2cd6442..36706508c 100644 --- a/dags/taar_daily.py +++ b/dags/taar_daily.py @@ -20,6 +20,7 @@ from airflow.models import Variable from airflow.operators.subdag import SubDagOperator from airflow.sensors.external_task import ExternalTaskSensor + from operators.gcp_container_operator import GKEPodOperator from utils.constants import ALLOWED_STATES, FAILED_STATES from utils.dataproc import ( diff --git a/dags/taar_weekly.py b/dags/taar_weekly.py index 6bfec44f7..9edf7c452 100644 --- a/dags/taar_weekly.py +++ b/dags/taar_weekly.py @@ -1,5 +1,5 @@ """ -This configures a weekly DAG to run the TAAR Ensemble job off. +Configure a weekly DAG to run the TAAR Ensemble job off. For context, see https://github.com/mozilla/taar @@ -10,16 +10,16 @@ any actions for the past failed DAG runs. 
""" -from airflow import DAG from datetime import datetime, timedelta -from airflow.operators.subdag import SubDagOperator + +from airflow import DAG from airflow.models import Variable +from airflow.operators.subdag import SubDagOperator -from operators.gcp_container_operator import GKEPodOperator # noqa +from operators.gcp_container_operator import GKEPodOperator from utils.dataproc import moz_dataproc_pyspark_runner from utils.tags import Tag - taar_ensemble_cluster_name = "dataproc-taar-ensemble" taar_gcpdataproc_conn_id = "google_cloud_airflow_dataproc" taar_gcpdataproc_project_id = "airflow-dataproc" @@ -32,9 +32,7 @@ TAAR_DATAFLOW_SERVICE_ACCOUNT = Variable.get("taar_dataflow_service_account_email") # This uses a circleci built docker image from github.com/mozilla/taar_gcp_etl -TAAR_ETL_CONTAINER_IMAGE = ( - "gcr.io/moz-fx-data-airflow-prod-88e0/taar_gcp_etl:0.6.5" -) +TAAR_ETL_CONTAINER_IMAGE = "gcr.io/moz-fx-data-airflow-prod-88e0/taar_gcp_etl:0.6.5" DELETE_DAYS = 29 @@ -43,7 +41,7 @@ "email": [ "hwoo@mozilla.com", "epavlov@mozilla.com", - "telemetry-alerts@mozilla.com" + "telemetry-alerts@mozilla.com", ], "depends_on_past": False, "start_date": datetime(2020, 4, 4), @@ -56,7 +54,9 @@ tags = [Tag.ImpactTier.tier_2] taar_weekly = DAG( - "taar_weekly", default_args=default_args_weekly, schedule_interval="@weekly", + "taar_weekly", + default_args=default_args_weekly, + schedule_interval="@weekly", doc_md=__doc__, tags=tags, ) @@ -84,7 +84,7 @@ def taar_profile_common_args(): "--bigtable-instance-id=%s" % TAAR_BIGTABLE_INSTANCE_ID, "--sample-rate=1.0", "--subnetwork=%s" % TAAR_DATAFLOW_SUBNETWORK, - "--dataflow-service-account=%s" % TAAR_DATAFLOW_SERVICE_ACCOUNT + "--dataflow-service-account=%s" % TAAR_DATAFLOW_SERVICE_ACCOUNT, ] @@ -100,7 +100,7 @@ def taar_profile_common_args(): task_id="dump_bq_to_tmp_table", name="dump_bq_to_tmp_table", image=TAAR_ETL_CONTAINER_IMAGE, - arguments=taar_profile_common_args() + ["--fill-bq",], + arguments=[*taar_profile_common_args(), "--fill-bq"], dag=taar_weekly, ) @@ -108,7 +108,7 @@ def taar_profile_common_args(): task_id="extract_bq_tmp_to_gcs_avro", name="extract_bq_tmp_to_gcs_avro", image=TAAR_ETL_CONTAINER_IMAGE, - arguments=taar_profile_common_args() + ["--bq-to-gcs",], + arguments=[*taar_profile_common_args(), "--bq-to-gcs"], dag=taar_weekly, ) @@ -121,7 +121,7 @@ def taar_profile_common_args(): # Where the pod continues to run, but airflow loses its connection and sets the status to Failed # See: https://github.com/mozilla/telemetry-airflow/issues/844 get_logs=False, - arguments=taar_profile_common_args() + ["--gcs-to-bigtable",], + arguments=[*taar_profile_common_args(), "--gcs-to-bigtable"], dag=taar_weekly, ) @@ -129,9 +129,12 @@ def taar_profile_common_args(): task_id="delete_opt_out_users_from_bigtable", name="delete_opt_out_users_from_bigtable", image=TAAR_ETL_CONTAINER_IMAGE, - arguments=taar_profile_common_args() + ["--bigtable-delete-opt-out", - "--delete-opt-out-days=%s" % DELETE_DAYS], - dag=taar_weekly + arguments=[ + *taar_profile_common_args(), + "--bigtable-delete-opt-out", + "--delete-opt-out-days=%s" % DELETE_DAYS, + ], + dag=taar_weekly, ) wipe_gcs_bucket_cleanup = GKEPodOperator( @@ -146,7 +149,7 @@ def taar_profile_common_args(): task_id="wipe_bigquery_tmp_table", name="wipe_bigquery_tmp_table", image=TAAR_ETL_CONTAINER_IMAGE, - arguments=taar_profile_common_args() + ["--wipe-bigquery-tmp-table",], + arguments=[*taar_profile_common_args(), "--wipe-bigquery-tmp-table"], dag=taar_weekly, ) @@ -177,7 +180,7 @@ def 
taar_profile_common_args(): ], additional_metadata={ "PIP_PACKAGES": "mozilla-taar3==1.0.7 python-decouple==3.1 click==7.0 " - "google-cloud-storage==1.19.1" + "google-cloud-storage==1.19.1" }, optional_components=["ANACONDA", "JUPYTER"], py_args=[ @@ -201,11 +204,13 @@ def taar_profile_common_args(): ) -wipe_gcs_bucket >> \ -dump_bq_to_tmp_table >> \ -extract_bq_tmp_to_gcs_avro >> \ -dataflow_import_avro_to_bigtable >> \ -delete_optout >> \ -wipe_gcs_bucket_cleanup >> \ -wipe_bigquery_tmp_table >> \ -taar_ensemble +( + wipe_gcs_bucket + >> dump_bq_to_tmp_table + >> extract_bq_tmp_to_gcs_avro + >> dataflow_import_avro_to_bigtable + >> delete_optout + >> wipe_gcs_bucket_cleanup + >> wipe_bigquery_tmp_table + >> taar_ensemble +) diff --git a/dags/utils/constants.py b/dags/utils/constants.py deleted file mode 100644 index a8618d6db..000000000 --- a/dags/utils/constants.py +++ /dev/null @@ -1,11 +0,0 @@ -DS_WEEKLY = ( - '{% if dag_run.external_trigger %}' - '{{ ds_nodash }}' - '{% else %}' - '{{ macros.ds_format(macros.ds_add(ds, 6), "%Y-%m-%d", "%Y%m%d") }}' - '{% endif %}' -) - -FAILED_STATES = ['failed', 'upstream_failed', 'skipped'] - -ALLOWED_STATES = ['success'] diff --git a/docker-compose.yml b/docker-compose.yml index 20d2b55c4..a91144bdf 100644 --- a/docker-compose.yml +++ b/docker-compose.yml @@ -27,6 +27,7 @@ x-airflow-common: AIRFLOW__CELERY__RESULT_BACKEND: db+postgresql://airflow:airflow@postgres/airflow AIRFLOW__CELERY__BROKER_URL: redis://:@redis:6379/0 AIRFLOW__CORE__FERNET_KEY: $FERNET_KEY + AIRFLOW__CORE__DAGS_FOLDER: "/opt/airflow/dags" AIRFLOW__CORE__DAGS_ARE_PAUSED_AT_CREATION: 'true' AIRFLOW__CORE__LOAD_EXAMPLES: 'false' AIRFLOW__API__AUTH_BACKENDS: 'airflow.api.auth.backend.default' @@ -40,6 +41,8 @@ x-airflow-common: - ./dags:/opt/airflow/dags - ./logs:/opt/airflow/logs - ./plugins:/opt/airflow/plugins + - ./utils:/opt/airflow/utils + - ./operators:/opt/airflow/operators - ./resources/dev_webserver_config.py:/opt/airflow/webserver_config.py - ./resources/dev_connections.json:/opt/airflow/dev_connections.json - ./resources/dev_variables.json:/opt/airflow/dev_variables.json diff --git a/dags/glam_subdags/__init__.py b/operators/__init__.py similarity index 100% rename from dags/glam_subdags/__init__.py rename to operators/__init__.py diff --git a/dags/operators/bq_sensor.py b/operators/bq_sensor.py similarity index 100% rename from dags/operators/bq_sensor.py rename to operators/bq_sensor.py diff --git a/dags/operators/gcp_container_operator.py b/operators/gcp_container_operator.py similarity index 59% rename from dags/operators/gcp_container_operator.py rename to operators/gcp_container_operator.py index aaa24d3cd..c6939acbe 100644 --- a/dags/operators/gcp_container_operator.py +++ b/operators/gcp_container_operator.py @@ -1,48 +1,55 @@ -from airflow.providers.google.cloud.operators.kubernetes_engine import GKEStartPodOperator as UpstreamGKEPodOperator +from airflow.providers.google.cloud.operators.kubernetes_engine import ( + GKEStartPodOperator as UpstreamGKEPodOperator, +) class GKEPodOperator(UpstreamGKEPodOperator): """ + Based off GKEStartPodOperator. + - This will now have the same defaults as GkeNatPodOperator pointing to the newer GKE cluster - In 1.10.x this inherited from upstream GKEPodOperator, rather than GKEStartPodOperator(v2) - In 1.10.x we needed to override the execute and helper methods to set an environment -variable for authentication to work (CLOUDSDK_AUTH_CREDENTIAL_FILE_OVERRIDE). 
Fixed in v2 + variable for authentication to work (CLOUDSDK_AUTH_CREDENTIAL_FILE_OVERRIDE). Fixed in v2 - We will keep this class and call the upstream GkeStartPodOperator now, because -numerous places in our code references it still + numerous places in our code references it still - Overrides init to default image_pull_policy=Always, in_cluster=False, -do_xcom_push=False and GKE params + do_xcom_push=False and GKE params - Defaults reattach_on_restart=False to address a 1.10.12 regression where GkePodOperators reruns will simply attach to an existing pod and not perform any new work. - Hard sets reattach_on_restart=False when do_xcom_push=True to address an error Retrying a failed task with do_xcom_push=True causes airflow to reattach to the pod eventually causing a 'Handshake status 500 Internal Server Error'. Logs will indicate 'found a running pod with ... different try_number. Will attach to this pod and monitor - instead of starting new one' + instead of starting new one'. """ - def __init__(self, - image_pull_policy='Always', - in_cluster=False, - do_xcom_push=False, - reattach_on_restart=False, - # Defined in Airflow's UI -> Admin -> Connections - gcp_conn_id='google_cloud_airflow_gke', - project_id='moz-fx-data-airflow-gke-prod', - location='us-west1', - cluster_name='workloads-prod-v1', - namespace='default', - *args, - **kwargs): + def __init__( + self, + image_pull_policy="Always", + in_cluster=False, + do_xcom_push=False, + reattach_on_restart=False, + # Defined in Airflow's UI -> Admin -> Connections + gcp_conn_id="google_cloud_airflow_gke", + project_id="moz-fx-data-airflow-gke-prod", + location="us-west1", + cluster_name="workloads-prod-v1", + namespace="default", + *args, + **kwargs + ): # Hard set reattach_on_restart = False when do_xcom_push is enabled. if do_xcom_push: reattach_on_restart = False # GKE node pool autoscaling is failing to scale down when completed pods exist on the node # in Completed states, due to the pod not being replicated. E.g. behind an rc or deployment. 
- annotations = {'cluster-autoscaler.kubernetes.io/safe-to-evict': 'true'} + annotations = {"cluster-autoscaler.kubernetes.io/safe-to-evict": "true"} - super(GKEPodOperator, self).__init__( + super().__init__( + *args, image_pull_policy=image_pull_policy, in_cluster=in_cluster, do_xcom_push=do_xcom_push, @@ -53,38 +60,38 @@ def __init__(self, location=location, cluster_name=cluster_name, namespace=namespace, - *args, - **kwargs) + **kwargs + ) class GKENatPodOperator(UpstreamGKEPodOperator): - """ - This class is similar to the one defined above but has defaults to use a different GKE Cluster - With a NAT - """ - def __init__(self, - image_pull_policy='Always', - in_cluster=False, - do_xcom_push=False, - reattach_on_restart=False, - # Defined in Airflow's UI -> Admin -> Connections - gcp_conn_id='google_cloud_airflow_gke', - project_id='moz-fx-data-airflow-gke-prod', - location='us-west1', - cluster_name='workloads-prod-v1', - namespace='default', - *args, - **kwargs): + """Similar to the one defined above but has defaults to use a different GKE Cluster With a NAT.""" + def __init__( + self, + image_pull_policy="Always", + in_cluster=False, + do_xcom_push=False, + reattach_on_restart=False, + # Defined in Airflow's UI -> Admin -> Connections + gcp_conn_id="google_cloud_airflow_gke", + project_id="moz-fx-data-airflow-gke-prod", + location="us-west1", + cluster_name="workloads-prod-v1", + namespace="default", + *args, + **kwargs + ): # Hard set reattach_on_restart = False when do_xcom_push is enabled. if do_xcom_push: reattach_on_restart = False # GKE node pool autoscaling is failing to scale down when completed pods exist on the node # in Completed states, due to the pod not being replicated. E.g. behind an rc or deployment. - annotations = {'cluster-autoscaler.kubernetes.io/safe-to-evict': 'true'} + annotations = {"cluster-autoscaler.kubernetes.io/safe-to-evict": "true"} - super(GKENatPodOperator, self).__init__( + super().__init__( + *args, image_pull_policy=image_pull_policy, in_cluster=in_cluster, do_xcom_push=do_xcom_push, @@ -95,5 +102,5 @@ def __init__(self, location=location, cluster_name=cluster_name, namespace=namespace, - *args, - **kwargs) + **kwargs + ) diff --git a/plugins/backfill/main.py b/plugins/backfill/main.py index ad326b9b9..289f40049 100644 --- a/plugins/backfill/main.py +++ b/plugins/backfill/main.py @@ -1,60 +1,54 @@ -# -*- coding: utf-8 -*- # Modified from https://github.com/AnkurChoraywal/airflow-backfill-util.git # Inbuilt Imports -import os -import json -import logging import datetime +import json +import os import re -from typing import List # Custom Imports import flask from flask import request -from flask_admin import BaseView, expose -from flask_appbuilder import expose as app_builder_expose, BaseView as AppBuilderBaseView, has_access -from airflow import configuration - +from flask_admin import expose +from flask_appbuilder import BaseView as AppBuilderBaseView +from flask_appbuilder import expose as app_builder_expose from shelljob import proc -from dags.utils.backfill import BackfillParams + +from utils.backfill import BackfillParams # Inspired from # https://mortoray.com/2014/03/04/http-streaming-of-command-output-in-python-flask/ # https://www.endpoint.com/blog/2015/01/28/getting-realtime-output-using-python -# RBAC inspired from +# RBAC inspired from # https://github.com/teamclairvoyant/airflow-rest-api-plugin # Set your Airflow home path -if 'AIRFLOW_HOME' in os.environ: - airflow_home_path = os.environ['AIRFLOW_HOME'] -else: - airflow_home_path = 
'/tmp' +airflow_home_path = os.environ.get("AIRFLOW_HOME", "/tmp") # Local file where history will be stored -FILE = airflow_home_path + '/logs/backfill_history.txt' +FILE = airflow_home_path + "/logs/backfill_history.txt" # RE for remove ansi escape characters -ansi_escape = re.compile(r'\x1B[@-_][0-?]*[ -/]*[@-~]') +ansi_escape = re.compile(r"\x1B[@-_][0-?]*[ -/]*[@-~]") # Creating a flask admin BaseView def file_ops(mode, data=None): - """ File operators - logging/writing and reading user request """ - if mode == 'r': + """File operators - logging/writing and reading user request.""" + if mode == "r": try: - with open(FILE, 'r') as f: + with open(FILE) as f: return f.read() - except IOError: - with open(FILE, 'w') as f: + except OSError: + with open(FILE, "w") as f: return f.close() - elif mode == 'w' and data: + elif mode == "w" and data: today = datetime.datetime.now() print(os.getcwd()) - with open(FILE, 'a+') as f: - file_data = '{},{}\n'.format(data, today) + with open(FILE, "a+") as f: + file_data = f"{data},{today}\n" f.write(file_data) return 1 @@ -66,14 +60,14 @@ def get_baseview(): class Backfill(get_baseview()): route_base = "/admin/backfill/" - @app_builder_expose('/') + @app_builder_expose("/") def list(self): return self.render_template("backfill_page.html") - @expose('/stream') - @app_builder_expose('/stream') + @expose("/stream") + @app_builder_expose("/stream") def stream(self): - """ Runs user request and outputs console stream to client""" + """Run user request and outputs console stream to client.""" dag_name = request.args.get("dag_name") start_date = request.args.get("start_date") end_date = request.args.get("end_date") @@ -89,43 +83,44 @@ def stream(self): task_regex=task_regex, dag_name=dag_name, start_date=start_date, - end_date=end_date) + end_date=end_date, + ) cmd = backfill_params.generate_backfill_command() - print('BACKFILL CMD:', cmd) + print("BACKFILL CMD:", cmd) # updates command used in history - file_ops('w', ' '.join(cmd)) + file_ops("w", " ".join(cmd)) g = proc.Group() g.run(cmd) # To print out cleared dry run task instances - pattern = '^\<.*\>$' + pattern = r"^\<.*\>$" def read_process(): while g.is_pending(): lines = g.readlines() - for proc, line in lines: + for _proc, line in lines: line = line.decode("utf-8") result = re.match(pattern, line) if result: # Adhere to text/event-stream format - line = line.replace('<', '').replace('>', '') + line = line.replace("<", "").replace(">", "") elif clear and not dry_run: # Special case/hack, airflow tasks clear -y no longer outputs a termination string, so we put one line = "Clear Done" yield "data:" + line + "\n\n" - return flask.Response(read_process(), mimetype='text/event-stream') + return flask.Response(read_process(), mimetype="text/event-stream") - @expose('/background') - @app_builder_expose('/background') + @expose("/background") + @app_builder_expose("/background") def background(self): - """ Runs user request in background """ + """Run user request in background.""" dag_name = request.args.get("dag_name") start_date = request.args.get("start_date") end_date = request.args.get("end_date") @@ -134,32 +129,32 @@ def background(self): use_task_regex = request.args.get("use_task_regex") # create a screen id based on timestamp - screen_id = datetime.datetime.now().strftime('%s') + screen_id = datetime.datetime.now().strftime("%s") # Prepare the command and execute in background - background_cmd = "screen -dmS {} ".format(screen_id) - if clear == 'true': - background_cmd = background_cmd + 'airflow 
tasks clear -y ' - elif clear == 'false': - background_cmd = background_cmd + 'airflow dags backfill ' + background_cmd = f"screen -dmS {screen_id} " + if clear == "true": + background_cmd = background_cmd + "airflow tasks clear -y " + elif clear == "false": + background_cmd = background_cmd + "airflow dags backfill " - if use_task_regex == 'true': - background_cmd = background_cmd + "-t {} ".format(task_regex) + if use_task_regex == "true": + background_cmd = background_cmd + f"-t {task_regex} " - background_cmd = background_cmd + "-s {} -e {} {}".format(start_date, end_date, dag_name) + background_cmd = background_cmd + f"-s {start_date} -e {end_date} {dag_name}" # Update command in file - file_ops('w', background_cmd) + file_ops("w", background_cmd) print(background_cmd) os.system(background_cmd) - response = json.dumps({'submitted': True}) - return flask.Response(response, mimetype='text/json') + response = json.dumps({"submitted": True}) + return flask.Response(response, mimetype="text/json") - @expose('/history') - @app_builder_expose('/history') + @expose("/history") + @app_builder_expose("/history") def history(self): - """ Outputs recent user request history """ - return flask.Response(file_ops('r'), mimetype='text/txt') + """Output recent user request history.""" + return flask.Response(file_ops("r"), mimetype="text/txt") diff --git a/pyproject.toml b/pyproject.toml index 6a55a1531..e593760ac 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -2,7 +2,7 @@ known-third-party = ["airflow"] [tool.ruff] -target-version = "py38" +target-version = "py310" select = [ "E", # pycodestyle "W", # pycodestyle diff --git a/tests/utils/test_backfill.py b/tests/utils/test_backfill.py index 95bdedd18..ac87b34bb 100644 --- a/tests/utils/test_backfill.py +++ b/tests/utils/test_backfill.py @@ -1,6 +1,6 @@ import pytest -from dags.utils.backfill import BackfillParams +from utils.backfill import BackfillParams @pytest.fixture() diff --git a/tests/utils/test_tags.py b/tests/utils/test_tags.py index f259a7c6d..5a4de2953 100644 --- a/tests/utils/test_tags.py +++ b/tests/utils/test_tags.py @@ -1,28 +1,29 @@ import pytest -from dags.utils.tags import Tag, InvalidTagError + +from utils.tags import InvalidTagError, Tag @pytest.mark.parametrize( - "actual,expected", + ("actual", "expected"), [ (Tag.ImpactTier.tier_1, "impact/tier_1"), (Tag.ImpactTier.tier_2, "impact/tier_2"), (Tag.ImpactTier.tier_3, "impact/tier_3"), - ] + ], ) def test_valid_impact_tag(actual, expected): assert actual == expected @pytest.mark.parametrize( - "obj,attr,expected", + ("obj", "attr", "expected"), [ (Tag.ImpactTier, "tier_1", "impact/tier_1"), (Tag.ImpactTier, "tier_2", "impact/tier_2"), (Tag.ImpactTier, "tier_3", "impact/tier_3"), - ] + ], ) -def test_valid_impact_tag(obj, attr, expected): +def test_get_impact_tag(obj, attr, expected): assert getattr(obj, attr) == expected @@ -32,8 +33,8 @@ def test_valid_impact_tag(obj, attr, expected): "tier_4", "", "bq-etl", - ] + ], ) -def test_valid_impact_tag(invalid_input): +def test_invalid_impact_tag(invalid_input): with pytest.raises(InvalidTagError): getattr(Tag.ImpactTier, invalid_input) diff --git a/dags/operators/__init__.py b/utils/__init__.py similarity index 100% rename from dags/operators/__init__.py rename to utils/__init__.py diff --git a/dags/utils/__init__.py b/utils/acoustic/__init__.py similarity index 100% rename from dags/utils/__init__.py rename to utils/acoustic/__init__.py diff --git a/dags/utils/acoustic/acoustic_client.py b/utils/acoustic/acoustic_client.py 
similarity index 83% rename from dags/utils/acoustic/acoustic_client.py rename to utils/acoustic/acoustic_client.py index 28c1bb148..eec18517c 100644 --- a/dags/utils/acoustic/acoustic_client.py +++ b/utils/acoustic/acoustic_client.py @@ -1,15 +1,14 @@ """ -Module for authenticating into and interacting with Acoustic (XML) API +Module for authenticating into and interacting with Acoustic (XML) API. Acoustic API docs can be found here: https://developer.goacoustic.com/acoustic-campaign/reference/overview """ +import logging from datetime import datetime from time import sleep -import logging import requests - import xmltodict # type: ignore DATE_FORMAT = "%m/%d/%Y %H:%M:%S" @@ -22,9 +21,7 @@ def _request_wrapper(request_method, request_body): class AcousticClient: - """ - Acoustic Client object for authenticating into and interacting with Acoustic XML API. - """ + """Acoustic Client object for authenticating into and interacting with Acoustic XML API.""" DEFAULT_BASE_URL = "https://api-campaign-us-6.goacoustic.com" XML_API_ENDPOINT = "XMLAPI" @@ -34,10 +31,12 @@ def __init__( client_id: str, client_secret: str, refresh_token: str, - base_url: str = None, + base_url: str | None = None, **kwargs, ): """ + Initialize. + :param client_id: to provide the client identity :param client_secret: secret that it used to confirm identity to the API :param refresh_token: A long-lived value that the client store, @@ -54,12 +53,13 @@ def __init__( def _generate_access_token(self) -> str: """ - Responsible for contacting Acoustic XML API and generating an access_token using Oauth method + Responsible for contacting Acoustic XML API and generating an access_token using Oauth method. + to be used for interacting with the API and retrieving data in the following calls. :return: A short-lived token that can be generated based on the refresh token. A value that is ultimately passed to the API to prove that this client is authorized - to make API calls on the user’s behalf. + to make API calls on the user`s behalf. More info about Acoustic and Oauth: https://developer.goacoustic.com/acoustic-campaign/reference/overview#getting-started-with-oauth @@ -92,9 +92,9 @@ def _generate_access_token(self) -> str: return response.json()["access_token"] - def _is_job_complete(self, job_id: int, extra_info: str = None) -> bool: + def _is_job_complete(self, job_id: int, extra_info: str | None = None) -> bool: """ - Checks status of an Acoustic job to generate a report. + Check status of an Acoustic job to generate a report. :param job_id: Acoustic job id to check the progress status of @@ -128,19 +128,21 @@ def _is_job_complete(self, job_id: int, extra_info: str = None) -> bool: job_status = data["Envelope"]["Body"]["RESULT"]["JOB_STATUS"].lower() print( - "INFO: Current status for Acoustic job_id: %s is %s (%s)" - % (job_id, job_status, extra_info or "") + "INFO: Current status for Acoustic job_id: {} is {} ({})".format( + job_id, job_status, extra_info or "" + ) ) - return "complete" == job_status + return job_status == "complete" def generate_report( - self, request_template: str, template_params: dict, report_type: str, + self, + request_template: str, + template_params: dict, + report_type: str, ) -> str: """ - Extracts a listing of Acoustic Campaign emails that are sent for an organization - for a specified date range and provides metrics for those emails. - + Extract a listing of Acoustic Campaign emails that are sent for an organization for a specified date range and provides metrics for those emails. 
:param request_template: path to XML template file containing request body template to be used :param template_params: values that should be used to render the request template provided @@ -148,7 +150,10 @@ def generate_report( :return: Returns back a list of rows of data returned by the API call as a dictionary """ - supported_report_types = ("raw_recipient_export", "contact_export",) + supported_report_types = ( + "raw_recipient_export", + "contact_export", + ) if report_type not in supported_report_types: err_msg = f"{report_type} is not a valid option, supported types are: {supported_report_types}" raise AttributeError(err_msg) @@ -182,6 +187,8 @@ def generate_report( while not self._is_job_complete(job_id=job_id, extra_info=report_type): sleep(sleep_delay) - logging.info(f"{report_type} generation complete. Report location: {report_loc}. Time taken: {datetime.now() - start}") + logging.info( + f"{report_type} generation complete. Report location: {report_loc}. Time taken: {datetime.now() - start}" + ) return diff --git a/dags/utils/backfill.py b/utils/backfill.py similarity index 100% rename from dags/utils/backfill.py rename to utils/backfill.py diff --git a/dags/utils/callbacks.py b/utils/callbacks.py similarity index 65% rename from dags/utils/callbacks.py rename to utils/callbacks.py index 18180e522..25e65c5a0 100644 --- a/dags/utils/callbacks.py +++ b/utils/callbacks.py @@ -1,22 +1,25 @@ -from typing import Optional +from typing import TYPE_CHECKING -from airflow.models.dagrun import DagRun from airflow.models.taskinstance import clear_task_instances from airflow.utils.context import Context from airflow.utils.db import provide_session from sqlalchemy.orm.session import Session +if TYPE_CHECKING: + from airflow.models.dagrun import DagRun + @provide_session -def retry_tasks_callback(context: Context, session: Optional[Session] = None) -> None: - """Callback to clear tasks specified by the `retry_tasks` task param. - +def retry_tasks_callback(context: Context, session: Session | None = None) -> None: + """ + Clear tasks specified by the `retry_tasks` task param. + Intended to be used to as an `on_retry_callback` to also retry other tasks when a task fails. """ - retry_task_ids: list[str] = context['params'].get('retry_tasks', []) + retry_task_ids: list[str] = context["params"].get("retry_tasks", []) if isinstance(retry_task_ids, str): retry_task_ids = [retry_task_ids] - dag_run: DagRun = context['dag_run'] + dag_run: DagRun = context["dag_run"] retry_task_instances = [ task_instance for task_instance in dag_run.get_task_instances(session=session) diff --git a/utils/constants.py b/utils/constants.py new file mode 100644 index 000000000..693bf2b5c --- /dev/null +++ b/utils/constants.py @@ -0,0 +1,11 @@ +DS_WEEKLY = ( + "{% if dag_run.external_trigger %}" + "{{ ds_nodash }}" + "{% else %}" + '{{ macros.ds_format(macros.ds_add(ds, 6), "%Y-%m-%d", "%Y%m%d") }}' + "{% endif %}" +) + +FAILED_STATES = ["failed", "upstream_failed", "skipped"] + +ALLOWED_STATES = ["success"] diff --git a/dags/utils/dataproc.py b/utils/dataproc.py similarity index 54% rename from dags/utils/dataproc.py rename to utils/dataproc.py index 981350f8f..94373a63b 100644 --- a/dags/utils/dataproc.py +++ b/utils/dataproc.py @@ -1,4 +1,3 @@ -import json import os from collections import namedtuple @@ -19,43 +18,44 @@ DataprocSubmitSparkJobOperator, ) -class DataProcHelper: - """ - This is a helper class for creating/deleting dataproc clusters. 
- """ - - def __init__(self, - cluster_name=None, - job_name=None, - num_workers=2, - image_version='1.4-debian10', - region='us-west1', - subnetwork_uri=None, - internal_ip_only=None, - idle_delete_ttl=14400, - auto_delete_ttl=28800, - master_machine_type='n1-standard-8', - worker_machine_type='n1-standard-4', - num_preemptible_workers=0, - service_account='dataproc-runner-prod@airflow-dataproc.iam.gserviceaccount.com', - init_actions_uris=None, - additional_metadata=None, - additional_properties=None, - optional_components=['ANACONDA'], - install_component_gateway=True, - aws_conn_id=None, - gcp_conn_id='google_cloud_airflow_dataproc', - project_id='airflow-dataproc', - artifact_bucket='moz-fx-data-prod-airflow-dataproc-artifacts', - storage_bucket='moz-fx-data-prod-dataproc-scratch', - master_disk_type='pd-standard', - master_disk_size=1024, - master_num_local_ssds=0, - worker_disk_type='pd-standard', - worker_disk_size=1024, - worker_num_local_ssds=0, - ): +class DataProcHelper: + """Helper class for creating/deleting dataproc clusters.""" + + def __init__( + self, + cluster_name=None, + job_name=None, + num_workers=2, + image_version="1.4-debian10", + region="us-west1", + subnetwork_uri=None, + internal_ip_only=None, + idle_delete_ttl=14400, + auto_delete_ttl=28800, + master_machine_type="n1-standard-8", + worker_machine_type="n1-standard-4", + num_preemptible_workers=0, + service_account="dataproc-runner-prod@airflow-dataproc.iam.gserviceaccount.com", + init_actions_uris=None, + additional_metadata=None, + additional_properties=None, + optional_components=None, + install_component_gateway=True, + aws_conn_id=None, + gcp_conn_id="google_cloud_airflow_dataproc", + project_id="airflow-dataproc", + artifact_bucket="moz-fx-data-prod-airflow-dataproc-artifacts", + storage_bucket="moz-fx-data-prod-dataproc-scratch", + master_disk_type="pd-standard", + master_disk_size=1024, + master_num_local_ssds=0, + worker_disk_type="pd-standard", + worker_disk_size=1024, + worker_num_local_ssds=0, + ): + if optional_components is None: + optional_components = ["ANACONDA"] self.cluster_name = cluster_name self.job_name = job_name self.num_workers = num_workers @@ -82,9 +82,11 @@ def __init__(self, self.worker_num_local_ssds = worker_num_local_ssds if init_actions_uris is None: - self.init_actions_uris=['gs://{}/bootstrap/dataproc_init.sh'.format(self.artifact_bucket)] + self.init_actions_uris = [ + f"gs://{self.artifact_bucket}/bootstrap/dataproc_init.sh" + ] else: - self.init_actions_uris=init_actions_uris + self.init_actions_uris = init_actions_uris if additional_metadata is None: self.additional_metadata = {} @@ -103,9 +105,7 @@ def __init__(self, self.project_id = project_id def create_cluster(self): - """ - Returns a DataprocCreateClusterOperator - """ + """Return a DataprocCreateClusterOperator.""" properties = {} # Google cloud storage requires object.create permission when reading from pyspark @@ -115,7 +115,9 @@ def create_cluster(self): if self.aws_conn_id: for key, value in zip( ("access.key", "secret.key", "session.token"), - AwsBaseHook(aws_conn_id=self.aws_conn_id, client_type='s3').get_credentials(), + AwsBaseHook( + aws_conn_id=self.aws_conn_id, client_type="s3" + ).get_credentials(), ): if value is not None: properties["core:fs.s3a." 
+ key] = value @@ -128,22 +130,22 @@ def create_cluster(self): properties.update(self.additional_properties) metadata = { - 'gcs-connector-version': '1.9.16', - 'bigquery-connector-version': '0.13.6' - } + "gcs-connector-version": "1.9.16", + "bigquery-connector-version": "0.13.6", + } metadata.update(self.additional_metadata) cluster_generator = ClusterGenerator( - project_id = self.project_id, - num_workers = self.num_workers, - subnetwork_uri = self.subnetwork_uri, - internal_ip_only = self.internal_ip_only, + project_id=self.project_id, + num_workers=self.num_workers, + subnetwork_uri=self.subnetwork_uri, + internal_ip_only=self.internal_ip_only, storage_bucket=self.storage_bucket, init_actions_uris=self.init_actions_uris, - metadata = metadata, + metadata=metadata, image_version=self.image_version, - properties = properties, - optional_components = self.optional_components, + properties=properties, + optional_components=self.optional_components, master_machine_type=self.master_machine_type, master_disk_type=self.master_disk_type, master_disk_size=self.master_disk_size, @@ -153,7 +155,7 @@ def create_cluster(self): num_preemptible_workers=self.num_preemptible_workers, service_account=self.service_account, idle_delete_ttl=self.idle_delete_ttl, - auto_delete_ttl=self.auto_delete_ttl + auto_delete_ttl=self.auto_delete_ttl, ) cluster_config = cluster_generator.make() @@ -162,83 +164,93 @@ def create_cluster(self): # ClusterConfig format is # https://cloud.google.com/dataproc/docs/reference/rpc/google.cloud.dataproc.v1#google.cloud.dataproc.v1.ClusterConfig if self.install_component_gateway: - cluster_config.update({'endpoint_config' : {'enable_http_port_access' : True}}) + cluster_config.update( + {"endpoint_config": {"enable_http_port_access": True}} + ) if self.master_num_local_ssds > 0: - master_instance_group_config = cluster_config['master_config'] - master_instance_group_config['disk_config']['num_local_ssds'] = self.master_num_local_ssds - cluster_config.update({'master_config' : master_instance_group_config}) + master_instance_group_config = cluster_config["master_config"] + master_instance_group_config["disk_config"][ + "num_local_ssds" + ] = self.master_num_local_ssds + cluster_config.update({"master_config": master_instance_group_config}) if self.worker_num_local_ssds > 0: - worker_instance_group_config = cluster_config['worker_config'] - worker_instance_group_config['disk_config']['num_local_ssds'] =self.worker_num_local_ssds - cluster_config.update({'worker_config' : worker_instance_group_config}) + worker_instance_group_config = cluster_config["worker_config"] + worker_instance_group_config["disk_config"][ + "num_local_ssds" + ] = self.worker_num_local_ssds + cluster_config.update({"worker_config": worker_instance_group_config}) return DataprocCreateClusterOperator( - task_id='create_dataproc_cluster', + task_id="create_dataproc_cluster", cluster_name=self.cluster_name, - project_id = self.project_id, + project_id=self.project_id, use_if_exists=True, delete_on_error=True, - labels={ 'env': os.getenv('DEPLOY_ENVIRONMENT', 'env_not_set'), - 'owner': os.getenv('AIRFLOW_CTX_DAG_OWNER', 'owner_not_set'), - 'jobname': self.job_name.lower().replace('_', '-') }, + labels={ + "env": os.getenv("DEPLOY_ENVIRONMENT", "env_not_set"), + "owner": os.getenv("AIRFLOW_CTX_DAG_OWNER", "owner_not_set"), + "jobname": self.job_name.lower().replace("_", "-"), + }, gcp_conn_id=self.gcp_conn_id, region=self.region, - cluster_config = cluster_config + cluster_config=cluster_config, ) def 
delete_cluster(self): - """ - Returns a DataprocDeleteClusterOperator - """ + """Return a DataprocDeleteClusterOperator.""" return DataprocDeleteClusterOperator( - task_id='delete_dataproc_cluster', + task_id="delete_dataproc_cluster", cluster_name=self.cluster_name, region=self.region, gcp_conn_id=self.gcp_conn_id, - project_id=self.project_id) -# End DataProcHelper + project_id=self.project_id, + ) -def moz_dataproc_pyspark_runner(parent_dag_name=None, - dag_name='run_pyspark_on_dataproc', - default_args=None, - cluster_name=None, - num_workers=2, - image_version='1.4-debian10', - region='us-west1', - subnetwork_uri=None, - internal_ip_only=None, - idle_delete_ttl=10800, - auto_delete_ttl=21600, - master_machine_type='n1-standard-8', - worker_machine_type='n1-standard-4', - num_preemptible_workers=0, - service_account='dataproc-runner-prod@airflow-dataproc.iam.gserviceaccount.com', - init_actions_uris=None, - additional_metadata=None, - additional_properties=None, - optional_components=['ANACONDA'], - install_component_gateway=True, - python_driver_code=None, - py_args=None, - job_name=None, - aws_conn_id=None, - gcp_conn_id='google_cloud_airflow_dataproc', - project_id='airflow-dataproc', - artifact_bucket='moz-fx-data-prod-airflow-dataproc-artifacts', - storage_bucket='moz-fx-data-prod-dataproc-scratch', - master_disk_type='pd-standard', - worker_disk_type='pd-standard', - master_disk_size=1024, - worker_disk_size=1024, - master_num_local_ssds=0, - worker_num_local_ssds=0, - ): +# End DataProcHelper + +def moz_dataproc_pyspark_runner( + parent_dag_name=None, + dag_name="run_pyspark_on_dataproc", + default_args=None, + cluster_name=None, + num_workers=2, + image_version="1.4-debian10", + region="us-west1", + subnetwork_uri=None, + internal_ip_only=None, + idle_delete_ttl=10800, + auto_delete_ttl=21600, + master_machine_type="n1-standard-8", + worker_machine_type="n1-standard-4", + num_preemptible_workers=0, + service_account="dataproc-runner-prod@airflow-dataproc.iam.gserviceaccount.com", + init_actions_uris=None, + additional_metadata=None, + additional_properties=None, + optional_components=None, + install_component_gateway=True, + python_driver_code=None, + py_args=None, + job_name=None, + aws_conn_id=None, + gcp_conn_id="google_cloud_airflow_dataproc", + project_id="airflow-dataproc", + artifact_bucket="moz-fx-data-prod-airflow-dataproc-artifacts", + storage_bucket="moz-fx-data-prod-dataproc-scratch", + master_disk_type="pd-standard", + worker_disk_type="pd-standard", + master_disk_size=1024, + worker_disk_size=1024, + master_num_local_ssds=0, + worker_num_local_ssds=0, +): """ - This will initially create a GCP Dataproc cluster with Anaconda/Jupyter/Component gateway. + Create a GCP Dataproc cluster with Anaconda/Jupyter/Component gateway. + Then we call DataprocSubmitPySparkJobOperator to execute the pyspark script defined by the argument python_driver_code. Once that succeeds, we teardown the cluster. 
@@ -342,98 +354,104 @@ def moz_dataproc_pyspark_runner(parent_dag_name=None, """ + if optional_components is None: + optional_components = ["ANACONDA"] if cluster_name is None or python_driver_code is None: - raise AirflowException('Please specify cluster_name and/or python_driver_code.') - - dataproc_helper = DataProcHelper(cluster_name=cluster_name, - job_name=job_name, - num_workers=num_workers, - image_version=image_version, - region=region, - subnetwork_uri=subnetwork_uri, - internal_ip_only=internal_ip_only, - idle_delete_ttl=idle_delete_ttl, - auto_delete_ttl=auto_delete_ttl, - master_machine_type=master_machine_type, - worker_machine_type=worker_machine_type, - num_preemptible_workers=num_preemptible_workers, - service_account=service_account, - init_actions_uris=init_actions_uris, - optional_components=optional_components, - additional_metadata=additional_metadata, - additional_properties=additional_properties, - install_component_gateway=install_component_gateway, - aws_conn_id=aws_conn_id, - gcp_conn_id=gcp_conn_id, - project_id=project_id, - artifact_bucket=artifact_bucket, - storage_bucket=storage_bucket, - master_disk_type=master_disk_type, - master_disk_size=master_disk_size, - worker_disk_type=worker_disk_type, - worker_disk_size=worker_disk_size, - master_num_local_ssds=master_num_local_ssds, - worker_num_local_ssds=worker_num_local_ssds, - ) - - _dag_name = '{}.{}'.format(parent_dag_name, dag_name) + raise AirflowException("Please specify cluster_name and/or python_driver_code.") + + dataproc_helper = DataProcHelper( + cluster_name=cluster_name, + job_name=job_name, + num_workers=num_workers, + image_version=image_version, + region=region, + subnetwork_uri=subnetwork_uri, + internal_ip_only=internal_ip_only, + idle_delete_ttl=idle_delete_ttl, + auto_delete_ttl=auto_delete_ttl, + master_machine_type=master_machine_type, + worker_machine_type=worker_machine_type, + num_preemptible_workers=num_preemptible_workers, + service_account=service_account, + init_actions_uris=init_actions_uris, + optional_components=optional_components, + additional_metadata=additional_metadata, + additional_properties=additional_properties, + install_component_gateway=install_component_gateway, + aws_conn_id=aws_conn_id, + gcp_conn_id=gcp_conn_id, + project_id=project_id, + artifact_bucket=artifact_bucket, + storage_bucket=storage_bucket, + master_disk_type=master_disk_type, + master_disk_size=master_disk_size, + worker_disk_type=worker_disk_type, + worker_disk_size=worker_disk_size, + master_num_local_ssds=master_num_local_ssds, + worker_num_local_ssds=worker_num_local_ssds, + ) + + _dag_name = f"{parent_dag_name}.{dag_name}" with models.DAG(_dag_name, default_args=default_args) as dag: create_dataproc_cluster = dataproc_helper.create_cluster() run_pyspark_on_dataproc = DataprocSubmitPySparkJobOperator( - task_id='run_dataproc_pyspark', + task_id="run_dataproc_pyspark", job_name=job_name, cluster_name=cluster_name, region=region, main=python_driver_code, arguments=py_args, gcp_conn_id=gcp_conn_id, - project_id=project_id + project_id=project_id, ) delete_dataproc_cluster = dataproc_helper.delete_cluster() create_dataproc_cluster >> run_pyspark_on_dataproc >> delete_dataproc_cluster return dag -# End moz_dataproc_pyspark_runner -def moz_dataproc_jar_runner(parent_dag_name=None, - dag_name='run_script_on_dataproc', - default_args=None, - cluster_name=None, - num_workers=2, - image_version='1.4-debian10', - region='us-west1', - subnetwork_uri=None, - internal_ip_only=None, - idle_delete_ttl=14400, - 
auto_delete_ttl=28800, - master_machine_type='n1-standard-8', - worker_machine_type='n1-standard-4', - num_preemptible_workers=0, - service_account='dataproc-runner-prod@airflow-dataproc.iam.gserviceaccount.com', - init_actions_uris=None, - optional_components=['ANACONDA'], - install_component_gateway=True, - jar_urls=None, - main_class=None, - jar_args=None, - job_name=None, - aws_conn_id=None, - gcp_conn_id='google_cloud_airflow_dataproc', - project_id='airflow-dataproc', - master_disk_type='pd-standard', - worker_disk_type='pd-standard', - master_disk_size=1024, - worker_disk_size=1024, - master_num_local_ssds=0, - worker_num_local_ssds=0, - ): +# End moz_dataproc_pyspark_runner + +def moz_dataproc_jar_runner( + parent_dag_name=None, + dag_name="run_script_on_dataproc", + default_args=None, + cluster_name=None, + num_workers=2, + image_version="1.4-debian10", + region="us-west1", + subnetwork_uri=None, + internal_ip_only=None, + idle_delete_ttl=14400, + auto_delete_ttl=28800, + master_machine_type="n1-standard-8", + worker_machine_type="n1-standard-4", + num_preemptible_workers=0, + service_account="dataproc-runner-prod@airflow-dataproc.iam.gserviceaccount.com", + init_actions_uris=None, + optional_components=None, + install_component_gateway=True, + jar_urls=None, + main_class=None, + jar_args=None, + job_name=None, + aws_conn_id=None, + gcp_conn_id="google_cloud_airflow_dataproc", + project_id="airflow-dataproc", + master_disk_type="pd-standard", + worker_disk_type="pd-standard", + master_disk_size=1024, + worker_disk_size=1024, + master_num_local_ssds=0, + worker_num_local_ssds=0, +): """ - This will initially create a GCP Dataproc cluster with Anaconda/Jupyter/Component gateway. + Create a GCP Dataproc cluster with Anaconda/Jupyter/Component gateway. + Then we call DataprocSubmitSparkJobOperator to execute the jar defined by the arguments jar_urls and main_class. Once that succeeds, we teardown the cluster. @@ -477,37 +495,42 @@ def moz_dataproc_jar_runner(parent_dag_name=None, """ + if optional_components is None: + optional_components = ["ANACONDA"] if cluster_name is None or jar_urls is None or main_class is None: - raise AirflowException('Please specify cluster_name, jar_urls, and/or main_class.') - - dataproc_helper = DataProcHelper(cluster_name=cluster_name, - job_name=job_name, - num_workers=num_workers, - image_version=image_version, - region=region, - subnetwork_uri=subnetwork_uri, - internal_ip_only=internal_ip_only, - idle_delete_ttl=idle_delete_ttl, - auto_delete_ttl=auto_delete_ttl, - master_machine_type=master_machine_type, - worker_machine_type=worker_machine_type, - num_preemptible_workers=num_preemptible_workers, - service_account=service_account, - init_actions_uris=init_actions_uris, - optional_components=optional_components, - install_component_gateway=install_component_gateway, - aws_conn_id=aws_conn_id, - gcp_conn_id=gcp_conn_id, - project_id=project_id, - master_disk_type=master_disk_type, - master_disk_size=master_disk_size, - worker_disk_type=worker_disk_type, - worker_disk_size=worker_disk_size, - master_num_local_ssds=master_num_local_ssds, - worker_num_local_ssds=worker_num_local_ssds, - ) - - _dag_name = '{}.{}'.format(parent_dag_name, dag_name) + raise AirflowException( + "Please specify cluster_name, jar_urls, and/or main_class." 
+ ) + + dataproc_helper = DataProcHelper( + cluster_name=cluster_name, + job_name=job_name, + num_workers=num_workers, + image_version=image_version, + region=region, + subnetwork_uri=subnetwork_uri, + internal_ip_only=internal_ip_only, + idle_delete_ttl=idle_delete_ttl, + auto_delete_ttl=auto_delete_ttl, + master_machine_type=master_machine_type, + worker_machine_type=worker_machine_type, + num_preemptible_workers=num_preemptible_workers, + service_account=service_account, + init_actions_uris=init_actions_uris, + optional_components=optional_components, + install_component_gateway=install_component_gateway, + aws_conn_id=aws_conn_id, + gcp_conn_id=gcp_conn_id, + project_id=project_id, + master_disk_type=master_disk_type, + master_disk_size=master_disk_size, + worker_disk_type=worker_disk_type, + worker_disk_size=worker_disk_size, + master_num_local_ssds=master_num_local_ssds, + worker_num_local_ssds=worker_num_local_ssds, + ) + + _dag_name = f"{parent_dag_name}.{dag_name}" with models.DAG(_dag_name, default_args=default_args) as dag: create_dataproc_cluster = dataproc_helper.create_cluster() @@ -515,62 +538,65 @@ def moz_dataproc_jar_runner(parent_dag_name=None, run_jar_on_dataproc = DataprocSubmitSparkJobOperator( cluster_name=cluster_name, region=region, - task_id='run_jar_on_dataproc', + task_id="run_jar_on_dataproc", job_name=job_name, dataproc_jars=jar_urls, main_class=main_class, arguments=jar_args, gcp_conn_id=gcp_conn_id, - project_id=project_id + project_id=project_id, ) delete_dataproc_cluster = dataproc_helper.delete_cluster() create_dataproc_cluster >> run_jar_on_dataproc >> delete_dataproc_cluster return dag + + # End moz_dataproc_jar_runner def _format_envvar(env=None): # Use a default value if an environment dictionary isn't supplied - return ' '.join(['{}={}'.format(k, v) for k, v in (env or {}).items()]) - - -def moz_dataproc_scriptrunner(parent_dag_name=None, - dag_name='run_script_on_dataproc', - default_args=None, - cluster_name=None, - num_workers=2, - image_version='1.4-debian10', - region='us-west1', - subnetwork_uri=None, - internal_ip_only=None, - idle_delete_ttl=14400, - auto_delete_ttl=28800, - master_machine_type='n1-standard-8', - worker_machine_type='n1-standard-4', - num_preemptible_workers=0, - service_account='dataproc-runner-prod@airflow-dataproc.iam.gserviceaccount.com', - init_actions_uris=None, - optional_components=['ANACONDA'], - install_component_gateway=True, - uri=None, - env=None, - arguments=None, - job_name=None, - aws_conn_id=None, - gcp_conn_id='google_cloud_airflow_dataproc', - project_id='airflow-dataproc', - master_disk_type='pd-standard', - worker_disk_type='pd-standard', - master_disk_size=1024, - worker_disk_size=1024, - master_num_local_ssds=0, - worker_num_local_ssds=0, - ): - + return " ".join([f"{k}={v}" for k, v in (env or {}).items()]) + + +def moz_dataproc_scriptrunner( + parent_dag_name=None, + dag_name="run_script_on_dataproc", + default_args=None, + cluster_name=None, + num_workers=2, + image_version="1.4-debian10", + region="us-west1", + subnetwork_uri=None, + internal_ip_only=None, + idle_delete_ttl=14400, + auto_delete_ttl=28800, + master_machine_type="n1-standard-8", + worker_machine_type="n1-standard-4", + num_preemptible_workers=0, + service_account="dataproc-runner-prod@airflow-dataproc.iam.gserviceaccount.com", + init_actions_uris=None, + optional_components=None, + install_component_gateway=True, + uri=None, + env=None, + arguments=None, + job_name=None, + aws_conn_id=None, + 
gcp_conn_id="google_cloud_airflow_dataproc", + project_id="airflow-dataproc", + master_disk_type="pd-standard", + worker_disk_type="pd-standard", + master_disk_size=1024, + worker_disk_size=1024, + master_num_local_ssds=0, + worker_num_local_ssds=0, +): """ - This will initially create a GCP Dataproc cluster with Anaconda/Jupyter/Component gateway. + Create a GCP Dataproc cluster with Anaconda/Jupyter/Component gateway. + Then we execute a script uri (either https or gcs) similar to how we use our custom AWS EmrSparkOperator. This will call DataprocSubmitSparkJobOperator using EMR's script-runner.jar, which then executes the airflow_gcp.sh entrypoint script. The entrypoint script expects another @@ -622,51 +648,57 @@ def moz_dataproc_scriptrunner(parent_dag_name=None, """ + if optional_components is None: + optional_components = ["ANACONDA"] if job_name is None or uri is None or cluster_name is None: - raise AirflowException('Please specify job_name, uri, and cluster_name.') - - dataproc_helper = DataProcHelper(cluster_name=cluster_name, - job_name=job_name, - num_workers=num_workers, - image_version=image_version, - region=region, - subnetwork_uri=subnetwork_uri, - internal_ip_only=internal_ip_only, - idle_delete_ttl=idle_delete_ttl, - auto_delete_ttl=auto_delete_ttl, - master_machine_type=master_machine_type, - worker_machine_type=worker_machine_type, - num_preemptible_workers=num_preemptible_workers, - service_account=service_account, - init_actions_uris=init_actions_uris, - optional_components=optional_components, - install_component_gateway=install_component_gateway, - aws_conn_id=aws_conn_id, - gcp_conn_id=gcp_conn_id, - project_id=project_id, - master_disk_type=master_disk_type, - master_disk_size=master_disk_size, - worker_disk_type=worker_disk_type, - worker_disk_size=worker_disk_size, - master_num_local_ssds=master_num_local_ssds, - worker_num_local_ssds=worker_num_local_ssds, - ) - - _dag_name = '{}.{}'.format(parent_dag_name, dag_name) + raise AirflowException("Please specify job_name, uri, and cluster_name.") + + dataproc_helper = DataProcHelper( + cluster_name=cluster_name, + job_name=job_name, + num_workers=num_workers, + image_version=image_version, + region=region, + subnetwork_uri=subnetwork_uri, + internal_ip_only=internal_ip_only, + idle_delete_ttl=idle_delete_ttl, + auto_delete_ttl=auto_delete_ttl, + master_machine_type=master_machine_type, + worker_machine_type=worker_machine_type, + num_preemptible_workers=num_preemptible_workers, + service_account=service_account, + init_actions_uris=init_actions_uris, + optional_components=optional_components, + install_component_gateway=install_component_gateway, + aws_conn_id=aws_conn_id, + gcp_conn_id=gcp_conn_id, + project_id=project_id, + master_disk_type=master_disk_type, + master_disk_size=master_disk_size, + worker_disk_type=worker_disk_type, + worker_disk_size=worker_disk_size, + master_num_local_ssds=master_num_local_ssds, + worker_num_local_ssds=worker_num_local_ssds, + ) + + _dag_name = f"{parent_dag_name}.{dag_name}" environment = _format_envvar(env) - script_bucket = 'moz-fx-data-prod-airflow-dataproc-artifacts' - jar_url = 'gs://{}/bin/script-runner.jar'.format(script_bucket) + script_bucket = "moz-fx-data-prod-airflow-dataproc-artifacts" + jar_url = f"gs://{script_bucket}/bin/script-runner.jar" args = [ - 'gs://{}/bootstrap/airflow_gcp.sh'.format(script_bucket), - '--job-name', job_name, - '--uri', uri, - '--environment', environment + f"gs://{script_bucket}/bootstrap/airflow_gcp.sh", + "--job-name", + job_name, + 
"--uri", + uri, + "--environment", + environment, ] if arguments: - args += ['--arguments', arguments] + args += ["--arguments", arguments] with models.DAG(_dag_name, default_args=default_args) as dag: create_dataproc_cluster = dataproc_helper.create_cluster() @@ -676,24 +708,27 @@ def moz_dataproc_scriptrunner(parent_dag_name=None, run_script_on_dataproc = DataprocSubmitSparkJobOperator( cluster_name=cluster_name, region=region, - task_id='run_script_on_dataproc', + task_id="run_script_on_dataproc", job_name=job_name, dataproc_jars=[jar_url], - main_class='com.amazon.elasticmapreduce.scriptrunner.ScriptRunner', + main_class="com.amazon.elasticmapreduce.scriptrunner.ScriptRunner", arguments=args, gcp_conn_id=gcp_conn_id, - project_id=project_id + project_id=project_id, ) delete_dataproc_cluster = dataproc_helper.delete_cluster() create_dataproc_cluster >> run_script_on_dataproc >> delete_dataproc_cluster return dag + + # End moz_dataproc_scriptrunner def copy_artifacts_dev(dag, project_id, artifact_bucket, storage_bucket): - """Bootstrap a dataproc job for local testing. + """ + Bootstrap a dataproc job for local testing. This job requires setting GOOGLE_APPLICATION_CREDENTIALS before starting the airflow container. It will copy the contents of the local jobs and @@ -747,14 +782,16 @@ def copy_artifacts_dev(dag, project_id, artifact_bucket, storage_bucket): def get_dataproc_parameters(conn_id="google_cloud_airflow_dataproc"): - """This function can be used to gather parameters that correspond to development - parameters. The provided connection string should be a Google Cloud connection + """ + Can be used to gather parameters that correspond to development parameters. + + The provided connection string should be a Google Cloud connection and should either be the production default ("dataproc-runner-prod"), or a service key associated with a sandbox account. 
""" dev_project_id = "replace_me" dev_client_email = "replace_me" - + is_dev = os.environ.get("DEPLOY_ENVIRONMENT") == "dev" project_id = "airflow-dataproc" if is_dev else dev_project_id client_email = ( @@ -763,18 +800,16 @@ def get_dataproc_parameters(conn_id="google_cloud_airflow_dataproc"): else "dataproc-runner-prod@airflow-dataproc.iam.gserviceaccount.com" ) artifact_bucket = ( - "{}-dataproc-artifacts".format(project_id) + f"{project_id}-dataproc-artifacts" if is_dev else "moz-fx-data-prod-airflow-dataproc-artifacts" ) storage_bucket = ( - "{}-dataproc-scratch".format(project_id) + f"{project_id}-dataproc-scratch" if is_dev else "moz-fx-data-prod-dataproc-scratch" ) - output_bucket = ( - artifact_bucket if is_dev else "airflow-dataproc-bq-parquet-exports" - ) + output_bucket = artifact_bucket if is_dev else "airflow-dataproc-bq-parquet-exports" return DataprocParameters( conn_id, project_id, @@ -782,5 +817,5 @@ def get_dataproc_parameters(conn_id="google_cloud_airflow_dataproc"): client_email, artifact_bucket, storage_bucket, - output_bucket + output_bucket, ) diff --git a/dags/utils/gcp.py b/utils/gcp.py similarity index 99% rename from dags/utils/gcp.py rename to utils/gcp.py index 28a5af50e..ae6192365 100644 --- a/dags/utils/gcp.py +++ b/utils/gcp.py @@ -14,8 +14,8 @@ BigQueryToGCSOperator, ) -from dags.operators.gcp_container_operator import GKEPodOperator -from dags.utils.dataproc import get_dataproc_parameters +from operators.gcp_container_operator import GKEPodOperator +from utils.dataproc import get_dataproc_parameters GCP_PROJECT_ID = "moz-fx-data-airflow-gke-prod" DATAPROC_PROJECT_ID = "airflow-dataproc" diff --git a/dags/utils/gke.py b/utils/gke.py similarity index 97% rename from dags/utils/gke.py rename to utils/gke.py index 4ee451ffa..94344ad57 100644 --- a/dags/utils/gke.py +++ b/utils/gke.py @@ -11,10 +11,8 @@ def create_gke_config( subnetwork="default", is_dev=False, ): - """ - Helper function to create gke cluster definition dict. All fields must match - their protobuf definitions. + Create gke cluster definition dict. All fields must match their protobuf definitions. 
See: https://cloud.google.com/kubernetes-engine/docs/reference/rest/v1beta1/projects.locations.clusters#Cluster diff --git a/dags/utils/acoustic/__init__.py b/utils/glam_subdags/__init__.py similarity index 100% rename from dags/utils/acoustic/__init__.py rename to utils/glam_subdags/__init__.py diff --git a/dags/glam_subdags/extract.py b/utils/glam_subdags/extract.py similarity index 99% rename from dags/glam_subdags/extract.py rename to utils/glam_subdags/extract.py index fbe3145b6..c7e9b0838 100644 --- a/dags/glam_subdags/extract.py +++ b/utils/glam_subdags/extract.py @@ -4,6 +4,7 @@ from airflow.providers.google.cloud.transfers.bigquery_to_gcs import ( BigQueryToGCSOperator, ) + from utils.gcp import bigquery_etl_query gcp_conn_id = "google_cloud_airflow_dataproc" diff --git a/dags/glam_subdags/general.py b/utils/glam_subdags/general.py similarity index 99% rename from dags/glam_subdags/general.py rename to utils/glam_subdags/general.py index 743b2e62e..eaab4d43a 100644 --- a/dags/glam_subdags/general.py +++ b/utils/glam_subdags/general.py @@ -1,4 +1,5 @@ from airflow.models import DAG + from utils.gcp import bigquery_etl_query diff --git a/dags/glam_subdags/generate_query.py b/utils/glam_subdags/generate_query.py similarity index 100% rename from dags/glam_subdags/generate_query.py rename to utils/glam_subdags/generate_query.py diff --git a/dags/glam_subdags/histograms.py b/utils/glam_subdags/histograms.py similarity index 99% rename from dags/glam_subdags/histograms.py rename to utils/glam_subdags/histograms.py index 40aa4009d..d84cc7882 100644 --- a/dags/glam_subdags/histograms.py +++ b/utils/glam_subdags/histograms.py @@ -1,4 +1,5 @@ from airflow.models import DAG + from utils.gcp import bigquery_etl_query GLAM_HISTOGRAM_AGGREGATES_FINAL_SUBDAG = "clients_histogram_aggregates" diff --git a/dags/glam_subdags/probe_hotlist.py b/utils/glam_subdags/probe_hotlist.py similarity index 94% rename from dags/glam_subdags/probe_hotlist.py rename to utils/glam_subdags/probe_hotlist.py index e7896bd38..f083fb99e 100644 --- a/dags/glam_subdags/probe_hotlist.py +++ b/utils/glam_subdags/probe_hotlist.py @@ -11,6 +11,8 @@ def update_hotlist( **kwargs, ): """ + Update hotlist. + :param task_id: Airflow task id :param project_id: GCP project to write to :param source_dataset_id: Bigquery dataset to read from in queries @@ -26,9 +28,7 @@ def update_hotlist( "DATASET": destination_dataset_id, "SUBMISSION_DATE": "{{ ds }}", } - command = [ - "script/glam/update_probe_hotlist" - ] + command = ["script/glam/update_probe_hotlist"] return gke_command( task_id=task_id, cmds=["bash"], diff --git a/dags/utils/patched/__init__.py b/utils/patched/__init__.py similarity index 100% rename from dags/utils/patched/__init__.py rename to utils/patched/__init__.py diff --git a/dags/utils/patched/dataproc_hook.py b/utils/patched/dataproc_hook.py similarity index 79% rename from dags/utils/patched/dataproc_hook.py rename to utils/patched/dataproc_hook.py index e72caf292..70540ad69 100644 --- a/dags/utils/patched/dataproc_hook.py +++ b/utils/patched/dataproc_hook.py @@ -16,9 +16,9 @@ # specific language governing permissions and limitations # under the License. # -"""This module contains a Google Cloud Dataproc hook.""" - """ +Contains a Google Cloud Dataproc hook. 
+ I have copy pasted this from the providers-google/5.0.0 branch https://github.com/apache/airflow/blob/providers-google/5.0.0/airflow/providers/google/cloud/hooks/dataproc.py @@ -34,10 +34,14 @@ import time import uuid import warnings -from typing import Any, Dict, Iterable, List, Optional, Sequence, Tuple, Union +from collections.abc import Iterable, Sequence +from airflow.exceptions import AirflowException +from airflow.providers.google.common.hooks.base_google import GoogleBaseHook +from airflow.version import version as airflow_version from google.api_core.exceptions import ServerError from google.api_core.retry import Retry + # Moz specific - import has changed from google.cloud.dataproc_v1 import ( Cluster, @@ -51,10 +55,6 @@ from google.protobuf.duration_pb2 import Duration from google.protobuf.field_mask_pb2 import FieldMask -from airflow.exceptions import AirflowException -from airflow.providers.google.common.hooks.base_google import GoogleBaseHook -from airflow.version import version as airflow_version - class DataProcJobBuilder: """A helper class for building Dataproc job.""" @@ -65,7 +65,7 @@ def __init__( task_id: str, cluster_name: str, job_type: str, - properties: Optional[Dict[str, str]] = None, + properties: dict[str, str] | None = None, ) -> None: name = task_id + "_" + str(uuid.uuid4())[:8] self.job_type = job_type @@ -73,7 +73,10 @@ def __init__( "job": { "reference": {"project_id": project_id, "job_id": name}, "placement": {"cluster_name": cluster_name}, - "labels": {'airflow-version': 'v' + airflow_version.replace('.', '-').replace('+', '-')}, + "labels": { + "airflow-version": "v" + + airflow_version.replace(".", "-").replace("+", "-") + }, job_type: {}, } } # type: Dict[str, Any] @@ -90,7 +93,7 @@ def add_labels(self, labels: dict) -> None: if labels: self.job["job"]["labels"].update(labels) - def add_variables(self, variables: List[str]) -> None: + def add_variables(self, variables: list[str]) -> None: """ Set variables for Dataproc job. @@ -100,7 +103,7 @@ def add_variables(self, variables: List[str]) -> None: if variables is not None: self.job["job"][self.job_type]["script_variables"] = variables - def add_args(self, args: List[str]) -> None: + def add_args(self, args: list[str]) -> None: """ Set args for Dataproc job. @@ -110,14 +113,14 @@ def add_args(self, args: List[str]) -> None: if args is not None: self.job["job"][self.job_type]["args"] = args - def add_query(self, query: List[str]) -> None: + def add_query(self, query: list[str]) -> None: """ Set query uris for Dataproc job. :param query: URIs for the job queries. :type query: List[str] """ - self.job["job"][self.job_type]["query_list"] = {'queries': [query]} + self.job["job"][self.job_type]["query_list"] = {"queries": [query]} def add_query_uri(self, query_uri: str) -> None: """ @@ -128,7 +131,7 @@ def add_query_uri(self, query_uri: str) -> None: """ self.job["job"][self.job_type]["query_file_uri"] = query_uri - def add_jar_file_uris(self, jars: List[str]) -> None: + def add_jar_file_uris(self, jars: list[str]) -> None: """ Set jars uris for Dataproc job. @@ -138,7 +141,7 @@ def add_jar_file_uris(self, jars: List[str]) -> None: if jars is not None: self.job["job"][self.job_type]["jar_file_uris"] = jars - def add_archive_uris(self, archives: List[str]) -> None: + def add_archive_uris(self, archives: list[str]) -> None: """ Set archives uris for Dataproc job. 
@@ -148,7 +151,7 @@ def add_archive_uris(self, archives: List[str]) -> None: if archives is not None: self.job["job"][self.job_type]["archive_uris"] = archives - def add_file_uris(self, files: List[str]) -> None: + def add_file_uris(self, files: list[str]) -> None: """ Set file uris for Dataproc job. @@ -158,7 +161,7 @@ def add_file_uris(self, files: List[str]) -> None: if files is not None: self.job["job"][self.job_type]["file_uris"] = files - def add_python_file_uris(self, pyfiles: List[str]) -> None: + def add_python_file_uris(self, pyfiles: list[str]) -> None: """ Set python file uris for Dataproc job. @@ -168,7 +171,7 @@ def add_python_file_uris(self, pyfiles: List[str]) -> None: if pyfiles is not None: self.job["job"][self.job_type]["python_file_uris"] = pyfiles - def set_main(self, main_jar: Optional[str], main_class: Optional[str]) -> None: + def set_main(self, main_jar: str | None, main_class: str | None) -> None: """ Set Dataproc main class. @@ -203,9 +206,9 @@ def set_job_name(self, name: str) -> None: """ self.job["job"]["reference"]["job_id"] = name + "_" + str(uuid.uuid4())[:8] - def build(self) -> Dict: + def build(self) -> dict: """ - Returns Dataproc job. + Return Dataproc job. :return: Dataproc job :rtype: dict @@ -222,9 +225,9 @@ class DataprocHook(GoogleBaseHook): """ def get_cluster_client( - self, region: Optional[str] = None, location: Optional[str] = None + self, region: str | None = None, location: str | None = None ) -> ClusterControllerClient: - """Returns ClusterControllerClient.""" + """Return ClusterControllerClient.""" if location is not None: warnings.warn( "Parameter `location` will be deprecated. " @@ -234,17 +237,19 @@ def get_cluster_client( ) region = location client_options = None - if region and region != 'global': - client_options = {'api_endpoint': f'{region}-dataproc.googleapis.com:443'} + if region and region != "global": + client_options = {"api_endpoint": f"{region}-dataproc.googleapis.com:443"} return ClusterControllerClient( - credentials=self._get_credentials(), client_info=self.client_info, client_options=client_options + credentials=self._get_credentials(), + client_info=self.client_info, + client_options=client_options, ) def get_template_client( - self, region: Optional[str] = None, location: Optional[str] = None + self, region: str | None = None, location: str | None = None ) -> WorkflowTemplateServiceClient: - """Returns WorkflowTemplateServiceClient.""" + """Return WorkflowTemplateServiceClient.""" if location is not None: warnings.warn( "Parameter `location` will be deprecated. " @@ -254,17 +259,19 @@ def get_template_client( ) region = location client_options = None - if region and region != 'global': - client_options = {'api_endpoint': f'{region}-dataproc.googleapis.com:443'} + if region and region != "global": + client_options = {"api_endpoint": f"{region}-dataproc.googleapis.com:443"} return WorkflowTemplateServiceClient( - credentials=self._get_credentials(), client_info=self.client_info, client_options=client_options + credentials=self._get_credentials(), + client_info=self.client_info, + client_options=client_options, ) def get_job_client( - self, region: Optional[str] = None, location: Optional[str] = None + self, region: str | None = None, location: str | None = None ) -> JobControllerClient: - """Returns JobControllerClient.""" + """Return JobControllerClient.""" if location is not None: warnings.warn( "Parameter `location` will be deprecated. 
" @@ -274,11 +281,13 @@ def get_job_client( ) region = location client_options = None - if region and region != 'global': - client_options = {'api_endpoint': f'{region}-dataproc.googleapis.com:443'} + if region and region != "global": + client_options = {"api_endpoint": f"{region}-dataproc.googleapis.com:443"} return JobControllerClient( - credentials=self._get_credentials(), client_info=self.client_info, client_options=client_options + credentials=self._get_credentials(), + client_info=self.client_info, + client_options=client_options, ) @GoogleBaseHook.fallback_to_default_project_id @@ -287,15 +296,15 @@ def create_cluster( region: str, project_id: str, cluster_name: str, - cluster_config: Union[Dict, Cluster], - labels: Optional[Dict[str, str]] = None, - request_id: Optional[str] = None, - retry: Optional[Retry] = None, - timeout: Optional[float] = None, - metadata: Optional[Sequence[Tuple[str, str]]] = None, + cluster_config: dict | Cluster, + labels: dict[str, str] | None = None, + request_id: str | None = None, + retry: Retry | None = None, + timeout: float | None = None, + metadata: Sequence[tuple[str, str]] | None = None, ): """ - Creates a cluster in a project. + Create a cluster in a project. :param project_id: Required. The ID of the Google Cloud project that the cluster belongs to. :type project_id: str @@ -326,7 +335,12 @@ def create_cluster( # [a-z]([-a-z0-9]*[a-z0-9])? (current airflow version string follows # semantic versioning spec: x.y.z). labels = labels or {} - labels.update({'airflow-version': 'v' + airflow_version.replace('.', '-').replace('+', '-')}) + labels.update( + { + "airflow-version": "v" + + airflow_version.replace(".", "-").replace("+", "-") + } + ) cluster = { "project_id": project_id, @@ -338,10 +352,10 @@ def create_cluster( client = self.get_cluster_client(region=region) result = client.create_cluster( request={ - 'project_id': project_id, - 'region': region, - 'cluster': cluster, - 'request_id': request_id, + "project_id": project_id, + "region": region, + "cluster": cluster, + "request_id": request_id, }, retry=retry, timeout=timeout, @@ -355,14 +369,14 @@ def delete_cluster( region: str, cluster_name: str, project_id: str, - cluster_uuid: Optional[str] = None, - request_id: Optional[str] = None, - retry: Optional[Retry] = None, - timeout: Optional[float] = None, - metadata: Optional[Sequence[Tuple[str, str]]] = None, + cluster_uuid: str | None = None, + request_id: str | None = None, + retry: Retry | None = None, + timeout: float | None = None, + metadata: Sequence[tuple[str, str]] | None = None, ): """ - Deletes a cluster in a project. + Delete a cluster in a project. :param project_id: Required. The ID of the Google Cloud project that the cluster belongs to. 
:type project_id: str @@ -389,11 +403,11 @@ def delete_cluster( client = self.get_cluster_client(region=region) result = client.delete_cluster( request={ - 'project_id': project_id, - 'region': region, - 'cluster_name': cluster_name, - 'cluster_uuid': cluster_uuid, - 'request_id': request_id, + "project_id": project_id, + "region": region, + "cluster_name": cluster_name, + "cluster_uuid": cluster_uuid, + "request_id": request_id, }, retry=retry, timeout=timeout, @@ -407,13 +421,12 @@ def diagnose_cluster( region: str, cluster_name: str, project_id: str, - retry: Optional[Retry] = None, - timeout: Optional[float] = None, - metadata: Optional[Sequence[Tuple[str, str]]] = None, + retry: Retry | None = None, + timeout: float | None = None, + metadata: Sequence[tuple[str, str]] | None = None, ): """ - Gets cluster diagnostic information. After the operation completes GCS uri to - diagnose is returned + Get cluster diagnostic information. After the operation completes GCS uri to diagnose is returned. :param project_id: Required. The ID of the Google Cloud project that the cluster belongs to. :type project_id: str @@ -432,7 +445,11 @@ def diagnose_cluster( """ client = self.get_cluster_client(region=region) operation = client.diagnose_cluster( - request={'project_id': project_id, 'region': region, 'cluster_name': cluster_name}, + request={ + "project_id": project_id, + "region": region, + "cluster_name": cluster_name, + }, retry=retry, timeout=timeout, metadata=metadata, @@ -447,12 +464,12 @@ def get_cluster( region: str, cluster_name: str, project_id: str, - retry: Optional[Retry] = None, - timeout: Optional[float] = None, - metadata: Optional[Sequence[Tuple[str, str]]] = None, + retry: Retry | None = None, + timeout: float | None = None, + metadata: Sequence[tuple[str, str]] | None = None, ): """ - Gets the resource representation for a cluster in a project. + Get the resource representation for a cluster in a project. :param project_id: Required. The ID of the Google Cloud project that the cluster belongs to. :type project_id: str @@ -471,7 +488,11 @@ def get_cluster( """ client = self.get_cluster_client(region=region) result = client.get_cluster( - request={'project_id': project_id, 'region': region, 'cluster_name': cluster_name}, + request={ + "project_id": project_id, + "region": region, + "cluster_name": cluster_name, + }, retry=retry, timeout=timeout, metadata=metadata, @@ -484,13 +505,13 @@ def list_clusters( region: str, filter_: str, project_id: str, - page_size: Optional[int] = None, - retry: Optional[Retry] = None, - timeout: Optional[float] = None, - metadata: Optional[Sequence[Tuple[str, str]]] = None, + page_size: int | None = None, + retry: Retry | None = None, + timeout: float | None = None, + metadata: Sequence[tuple[str, str]] | None = None, ): """ - Lists all regions/{region}/clusters in a project. + List all regions/{region}/clusters in a project. :param project_id: Required. The ID of the Google Cloud project that the cluster belongs to. 
:type project_id: str @@ -513,7 +534,12 @@ def list_clusters( """ client = self.get_cluster_client(region=region) result = client.list_clusters( - request={'project_id': project_id, 'region': region, 'filter': filter_, 'page_size': page_size}, + request={ + "project_id": project_id, + "region": region, + "filter": filter_, + "page_size": page_size, + }, retry=retry, timeout=timeout, metadata=metadata, @@ -524,19 +550,19 @@ def list_clusters( def update_cluster( self, cluster_name: str, - cluster: Union[Dict, Cluster], - update_mask: Union[Dict, FieldMask], + cluster: dict | Cluster, + update_mask: dict | FieldMask, project_id: str, - region: str = None, - location: Optional[str] = None, - graceful_decommission_timeout: Optional[Union[Dict, Duration]] = None, - request_id: Optional[str] = None, - retry: Optional[Retry] = None, - timeout: Optional[float] = None, - metadata: Optional[Sequence[Tuple[str, str]]] = None, + region: str | None = None, + location: str | None = None, + graceful_decommission_timeout: dict | Duration | None = None, + request_id: str | None = None, + retry: Retry | None = None, + timeout: float | None = None, + metadata: Sequence[tuple[str, str]] | None = None, ): """ - Updates a cluster in a project. + Update a cluster in a project. :param project_id: Required. The ID of the Google Cloud project the cluster belongs to. :type project_id: str @@ -609,13 +635,13 @@ def update_cluster( client = self.get_cluster_client(region=region) operation = client.update_cluster( request={ - 'project_id': project_id, - 'region': region, - 'cluster_name': cluster_name, - 'cluster': cluster, - 'update_mask': update_mask, - 'graceful_decommission_timeout': graceful_decommission_timeout, - 'request_id': request_id, + "project_id": project_id, + "region": region, + "cluster_name": cluster_name, + "cluster": cluster, + "update_mask": update_mask, + "graceful_decommission_timeout": graceful_decommission_timeout, + "request_id": request_id, }, retry=retry, timeout=timeout, @@ -626,16 +652,16 @@ def update_cluster( @GoogleBaseHook.fallback_to_default_project_id def create_workflow_template( self, - template: Union[Dict, WorkflowTemplate], + template: dict | WorkflowTemplate, project_id: str, - region: str = None, - location: Optional[str] = None, - retry: Optional[Retry] = None, - timeout: Optional[float] = None, - metadata: Optional[Sequence[Tuple[str, str]]] = None, + region: str | None = None, + location: str | None = None, + retry: Retry | None = None, + timeout: float | None = None, + metadata: Sequence[tuple[str, str]] | None = None, ) -> WorkflowTemplate: """ - Creates new workflow template. + Create new workflow template. :param project_id: Required. The ID of the Google Cloud project the cluster belongs to. 
:type project_id: str @@ -668,9 +694,12 @@ def create_workflow_template( raise TypeError("missing 1 required keyword argument: 'region'") metadata = metadata or () client = self.get_template_client(region) - parent = f'projects/{project_id}/regions/{region}' + parent = f"projects/{project_id}/regions/{region}" return client.create_workflow_template( - request={'parent': parent, 'template': template}, retry=retry, timeout=timeout, metadata=metadata + request={"parent": parent, "template": template}, + retry=retry, + timeout=timeout, + metadata=metadata, ) @GoogleBaseHook.fallback_to_default_project_id @@ -678,17 +707,17 @@ def instantiate_workflow_template( self, template_name: str, project_id: str, - region: str = None, - location: Optional[str] = None, - version: Optional[int] = None, - request_id: Optional[str] = None, - parameters: Optional[Dict[str, str]] = None, - retry: Optional[Retry] = None, - timeout: Optional[float] = None, - metadata: Optional[Sequence[Tuple[str, str]]] = None, + region: str | None = None, + location: str | None = None, + version: int | None = None, + request_id: str | None = None, + parameters: dict[str, str] | None = None, + retry: Retry | None = None, + timeout: float | None = None, + metadata: Sequence[tuple[str, str]] | None = None, ): # pylint: disable=too-many-arguments """ - Instantiates a template and begins execution. + Instantiate a template and begins execution. :param template_name: Name of template to instantiate. :type template_name: str @@ -733,9 +762,16 @@ def instantiate_workflow_template( raise TypeError("missing 1 required keyword argument: 'region'") metadata = metadata or () client = self.get_template_client(region) - name = f'projects/{project_id}/regions/{region}/workflowTemplates/{template_name}' + name = ( + f"projects/{project_id}/regions/{region}/workflowTemplates/{template_name}" + ) operation = client.instantiate_workflow_template( - request={'name': name, 'version': version, 'request_id': request_id, 'parameters': parameters}, + request={ + "name": name, + "version": version, + "request_id": request_id, + "parameters": parameters, + }, retry=retry, timeout=timeout, metadata=metadata, @@ -745,17 +781,17 @@ def instantiate_workflow_template( @GoogleBaseHook.fallback_to_default_project_id def instantiate_inline_workflow_template( self, - template: Union[Dict, WorkflowTemplate], + template: dict | WorkflowTemplate, project_id: str, - region: str = None, - location: Optional[str] = None, - request_id: Optional[str] = None, - retry: Optional[Retry] = None, - timeout: Optional[float] = None, - metadata: Optional[Sequence[Tuple[str, str]]] = None, + region: str | None = None, + location: str | None = None, + request_id: str | None = None, + retry: Retry | None = None, + timeout: float | None = None, + metadata: Sequence[tuple[str, str]] | None = None, ): """ - Instantiates a template and begins execution. + Instantiate a template and begins execution. :param template: The workflow template to instantiate. 
If a dict is provided, it must be of the same form as the protobuf message WorkflowTemplate @@ -792,9 +828,9 @@ def instantiate_inline_workflow_template( raise TypeError("missing 1 required keyword argument: 'region'") metadata = metadata or () client = self.get_template_client(region) - parent = f'projects/{project_id}/regions/{region}' + parent = f"projects/{project_id}/regions/{region}" operation = client.instantiate_inline_workflow_template( - request={'parent': parent, 'template': template, 'request_id': request_id}, + request={"parent": parent, "template": template, "request_id": request_id}, retry=retry, timeout=timeout, metadata=metadata, @@ -807,12 +843,12 @@ def wait_for_job( job_id: str, project_id: str, wait_time: int = 10, - region: str = None, - location: Optional[str] = None, - timeout: Optional[int] = None, + region: str | None = None, + location: str | None = None, + timeout: int | None = None, ) -> None: """ - Helper method which polls a job to check if it finishes. + Poll a job to check if it finishes. :param job_id: Id of the Dataproc job :type job_id: str @@ -840,34 +876,43 @@ def wait_for_job( raise TypeError("missing 1 required keyword argument: 'region'") state = None start = time.monotonic() - while state not in (JobStatus.State.ERROR, JobStatus.State.DONE, JobStatus.State.CANCELLED): + while state not in ( + JobStatus.State.ERROR, + JobStatus.State.DONE, + JobStatus.State.CANCELLED, + ): if timeout and start + timeout < time.monotonic(): - raise AirflowException(f"Timeout: dataproc job {job_id} is not ready after {timeout}s") + raise AirflowException( + f"Timeout: dataproc job {job_id} is not ready after {timeout}s" + ) time.sleep(wait_time) try: job = self.get_job(project_id=project_id, region=region, job_id=job_id) state = job.status.state except ServerError as err: - self.log.info("Retrying. Dataproc API returned server error when waiting for job: %s", err) + self.log.info( + "Retrying. Dataproc API returned server error when waiting for job: %s", + err, + ) if state == JobStatus.State.ERROR: - raise AirflowException(f'Job failed:\n{job}') + raise AirflowException(f"Job failed:\n{job}") if state == JobStatus.State.CANCELLED: - raise AirflowException(f'Job was cancelled:\n{job}') + raise AirflowException(f"Job was cancelled:\n{job}") @GoogleBaseHook.fallback_to_default_project_id def get_job( self, job_id: str, project_id: str, - region: str = None, - location: Optional[str] = None, - retry: Optional[Retry] = None, - timeout: Optional[float] = None, - metadata: Optional[Sequence[Tuple[str, str]]] = None, + region: str | None = None, + location: str | None = None, + retry: Retry | None = None, + timeout: float | None = None, + metadata: Sequence[tuple[str, str]] | None = None, ) -> Job: """ - Gets the resource representation for a job in a project. + Get the resource representation for a job in a project. 
:param job_id: Id of the Dataproc job :type job_id: str @@ -899,7 +944,7 @@ def get_job( raise TypeError("missing 1 required keyword argument: 'region'") client = self.get_job_client(region=region) job = client.get_job( - request={'project_id': project_id, 'region': region, 'job_id': job_id}, + request={"project_id": project_id, "region": region, "job_id": job_id}, retry=retry, timeout=timeout, metadata=metadata, @@ -909,17 +954,17 @@ def get_job( @GoogleBaseHook.fallback_to_default_project_id def submit_job( self, - job: Union[dict, Job], + job: dict | Job, project_id: str, - region: str = None, - location: Optional[str] = None, - request_id: Optional[str] = None, - retry: Optional[Retry] = None, - timeout: Optional[float] = None, - metadata: Optional[Sequence[Tuple[str, str]]] = None, + region: str | None = None, + location: str | None = None, + request_id: str | None = None, + retry: Retry | None = None, + timeout: float | None = None, + metadata: Sequence[tuple[str, str]] | None = None, ) -> Job: """ - Submits a job to a cluster. + Submit a job to a cluster. :param job: The job resource. If a dict is provided, it must be of the same form as the protobuf message Job @@ -956,7 +1001,12 @@ def submit_job( raise TypeError("missing 1 required keyword argument: 'region'") client = self.get_job_client(region=region) return client.submit_job( - request={'project_id': project_id, 'region': region, 'job': job, 'request_id': request_id}, + request={ + "project_id": project_id, + "region": region, + "job": job, + "request_id": request_id, + }, retry=retry, timeout=timeout, metadata=metadata, @@ -966,11 +1016,11 @@ def submit( self, project_id: str, job: dict, - region: str = 'global', - job_error_states: Optional[Iterable[str]] = None, + region: str = "global", + job_error_states: Iterable[str] | None = None, ) -> None: """ - Submits Google Cloud Dataproc job. + Submit Google Cloud Dataproc job. :param project_id: The id of Google Cloud Dataproc project. :type project_id: str @@ -982,7 +1032,11 @@ def submit( :type job_error_states: List[str] """ # TODO: Remover one day - warnings.warn("This method is deprecated. Please use `submit_job`", DeprecationWarning, stacklevel=2) + warnings.warn( + "This method is deprecated. Please use `submit_job`", + DeprecationWarning, + stacklevel=2, + ) job_object = self.submit_job(region=region, project_id=project_id, job=job) job_id = job_object.reference.job_id self.wait_for_job(job_id=job_id, region=region, project_id=project_id) @@ -992,14 +1046,14 @@ def cancel_job( self, job_id: str, project_id: str, - region: Optional[str] = None, - location: Optional[str] = None, - retry: Optional[Retry] = None, - timeout: Optional[float] = None, - metadata: Optional[Sequence[Tuple[str, str]]] = None, + region: str | None = None, + location: str | None = None, + retry: Retry | None = None, + timeout: float | None = None, + metadata: Sequence[tuple[str, str]] | None = None, ) -> Job: """ - Starts a job cancellation request. + Start a job cancellation request. :param project_id: Required. The ID of the Google Cloud project that the job belongs to. :type project_id: str @@ -1018,15 +1072,14 @@ def cancel_job( :param metadata: Additional metadata that is provided to the method. :type metadata: Sequence[Tuple[str, str]] """ - if region is None: - if location is not None: - warnings.warn( - "Parameter `location` will be deprecated. 
" - "Please provide value through `region` parameter instead.", - DeprecationWarning, - stacklevel=1, - ) - region = location + if region is None and location is not None: + warnings.warn( + "Parameter `location` will be deprecated. " + "Please provide value through `region` parameter instead.", + DeprecationWarning, + stacklevel=1, + ) + region = location if region is None: warnings.warn( @@ -1034,14 +1087,13 @@ def cancel_job( DeprecationWarning, stacklevel=2, ) - region = 'global' + region = "global" client = self.get_job_client(region=region) job = client.cancel_job( - request={'project_id': project_id, 'region': region, 'job_id': job_id}, + request={"project_id": project_id, "region": region, "job_id": job_id}, retry=retry, timeout=timeout, metadata=metadata, ) return job - diff --git a/dags/utils/slack.py b/utils/slack.py similarity index 72% rename from dags/utils/slack.py rename to utils/slack.py index 690dbf31b..4655a0f6c 100644 --- a/dags/utils/slack.py +++ b/utils/slack.py @@ -3,9 +3,10 @@ SLACK_CHANNEL = "#airflow-alerts" + def if_task_fails_alert_slack(context): failed_alert = SlackAPIPostOperator( - task_id='slack_failed', + task_id="slack_failed", channel=SLACK_CHANNEL, token=Variable.get("slack_secret_token"), text=""" @@ -14,9 +15,9 @@ def if_task_fails_alert_slack(context): *Dag*: {dag} *Date*: {ds} """.format( - task=context.get('task_instance').task_id, - dag=context.get('task_instance').dag_id, - ds=context.get('ds') - ) + task=context.get("task_instance").task_id, + dag=context.get("task_instance").dag_id, + ds=context.get("ds"), + ), ) return failed_alert.execute(context=context) diff --git a/dags/utils/tags.py b/utils/tags.py similarity index 85% rename from dags/utils/tags.py rename to utils/tags.py index 69a153961..20bf39896 100644 --- a/dags/utils/tags.py +++ b/utils/tags.py @@ -16,6 +16,10 @@ def __getattr__(self, item: str) -> str: Instead of Tag.ImpactTier.value.tier_1.value we can just use Tag.ImpactTier.tier_1. + Simplify accessing enum values. + + Instead of Tag.ImpactTier.value.tier_1.value we can just use + Tag.ImpactTier.tier_1. # source: https://newbedev.com/enum-of-enums-in-python """ @@ -27,6 +31,7 @@ def __getattr__(self, item: str) -> str: ret_val = getattr(self.value, item).value except AttributeError as _err: raise InvalidTagError(_err) from None + raise InvalidTagError() from _err return ret_val