diff --git a/.gitignore b/.gitignore index 59b7212de..c6bbebff9 100644 --- a/.gitignore +++ b/.gitignore @@ -7,6 +7,7 @@ venv logs unittests.cfg airflow-webserver.pid +airflow-worker.pid .config .credentials diff --git a/Dockerfile b/Dockerfile index 5609e984b..7b2a4ad1b 100644 --- a/Dockerfile +++ b/Dockerfile @@ -3,7 +3,7 @@ # https://forums.docker.com/t/multiple-projects-stopped-building-on-docker-hub-operation-not-permitted/92570/6 # and https://forums.docker.com/t/multiple-projects-stopped-building-on-docker-hub-operation-not-permitted/92570/11 FROM python:3.7-slim-buster -MAINTAINER Jannis Leidel +MAINTAINER Harold Woo # Due to AIRFLOW-6854, Python 3.7 is chosen as the base python version. diff --git a/airflow.cfg b/airflow.cfg index 3bd350cd8..7e16364fd 100644 --- a/airflow.cfg +++ b/airflow.cfg @@ -1,15 +1,16 @@ [core] -# 1.10 additions default_timezone = utc -log_filename_template = {{ ti.dag_id }}/{{ ti.task_id }}/{{ execution_date.strftime("%%Y-%%m-%%dT%%H:%%M:%%S") }}/{{ try_number }}.log + +hide_sensitive_var_conn_fields = True +sensitive_var_conn_names = 'cred,CRED,secret,SECRET,pass,PASS,password,PASSWORD,private,PRIVATE,key,KEY,cert,CERT,token,TOKEN,AKIA' + +# This setting would not have any effect in an existing deployment where the default_pool already exists. +# default_pool_task_slot_count = 50 # The folder where your airflow pipelines live, most likely a # subfolder in a code repository dags_folder = $AIRFLOW_HOME/dags -# The folder where airflow should store its log files. This location -base_log_folder = $AIRFLOW_HOME/logs - # The executor class that airflow should use. Choices include # SequentialExecutor, LocalExecutor, CeleryExecutor executor = CeleryExecutor @@ -34,7 +35,7 @@ sql_alchemy_pool_recycle = 3600 parallelism = 16 # The number of task instances allowed to run concurrently by the scheduler -dag_concurrency = 16 +max_active_tasks_per_dag = 16 # Are DAGs paused by default at creation dags_are_paused_at_creation = True @@ -47,9 +48,20 @@ max_active_runs_per_dag = 5 # environment load_examples = False +# Whether to load the default connections that ship with Airflow. It's good to +# get started, but you probably want to set this to ``False`` in a production +# environment +# We have configured google_cloud_default, so hopefully this wont remove it. +load_default_connections = False + # Where your Airflow plugins are stored plugins_folder = $AIRFLOW_HOME/plugins +# Should tasks be executed via forking of the parent process ("False", +# the speedier option) or by spawning a new python process ("True" slow, +# but means plugin changes picked up by tasks straight away) +execute_tasks_new_python_interpreter = False + # Secret key to save connection passwords in the db # Setting this to $AIRFLOW_FERNET_KEY is broken in 1.9 for initdb. Set $AIRFLOW__CORE__FERNET_KEY instead # fernet_key = @@ -58,15 +70,162 @@ plugins_folder = $AIRFLOW_HOME/plugins donot_pickle = False # How long before timing out a python file import while filling the DagBag -dagbag_import_timeout = 30 +dagbag_import_timeout = 30.0 + +# Should a traceback be shown in the UI for dagbag import errors, +# instead of just the exception message +dagbag_import_error_tracebacks = True + +# If tracebacks are shown, how many entries from the traceback should be shown +dagbag_import_error_traceback_depth = 2 + +# How long before timing out a DagFileProcessor, which processes a dag file +dag_file_processor_timeout = 50 + +# The class to use for running task instances in a subprocess. 
+# Choices include StandardTaskRunner, CgroupTaskRunner or the full import path to the class +# when using a custom task runner. +task_runner = StandardTaskRunner + +# If set, tasks without a ``run_as_user`` argument will be run with this user +# Can be used to de-elevate a sudo user running Airflow when executing tasks +# default_impersonation = + +# What security module to use (for example kerberos) +# security = + +# Turn unit test mode on (overwrites many configuration options with test +# values at runtime) +unit_test_mode = False + +# Whether to enable pickling for xcom (note that this is insecure and allows for +# RCE exploits). +enable_xcom_pickling = False + +# Whether to override params with dag_run.conf. If you pass some key-value pairs +# through ``airflow dags backfill -c`` or +# ``airflow dags trigger -c``, the key-value pairs will override the existing ones in params. +dag_run_conf_overrides_params = True + +# When discovering DAGs, ignore any files that don't contain the strings ``DAG`` and ``airflow``. +dag_discovery_safe_mode = False + +# The number of retries each task is going to have by default. Can be overridden at dag or task level. +default_task_retries = 0 + +# We will override the next 2 intervals in prod via env vars. +# Updating serialized DAG can not be faster than a minimum interval to reduce database write rate. +# This flag sets the minimum interval (in seconds) after which the serialized DAGs in the DB should be updated. +# This helps in reducing database write rate. +min_serialized_dag_update_interval = 10 + +# Fetching serialized DAG can not be faster than a minimum interval to reduce database +# read rate. This config controls when your DAGs are updated in the Webserver +min_serialized_dag_fetch_interval = 5 + +# Whether to persist DAG files code in DB. +# If set to True, Webserver reads file contents from DB instead of +# trying to access files in a DAG folder. +# (Default is ``True``) +# Example: store_dag_code = True +# store_dag_code = + +# Maximum number of Rendered Task Instance Fields (Template Fields) per task to store +# in the Database. +# All the template_fields for each of Task Instance are stored in the Database. +# Keeping this number small may cause an error when you try to view ``Rendered`` tab in +# TaskInstance view for older tasks. +max_num_rendered_ti_fields_per_task = 30 + +# On each dagrun check against defined SLAs +check_slas = True + +# Path to custom XCom class that will be used to store and resolve operators results +# Example: xcom_backend = path.to.CustomXCom +xcom_backend = airflow.models.xcom.BaseXCom + +# By default Airflow plugins are lazily-loaded (only loaded when required). Set it to ``False``, +# if you want to load plugins whenever 'airflow' is invoked via cli or loaded from module. +lazy_load_plugins = True + +# By default Airflow providers are lazily-discovered (discovery and imports happen only when required). +# Set it to False, if you want to discover providers whenever 'airflow' is invoked via cli or +# loaded from module. +lazy_discover_providers = True + +# Number of times the code should be retried in case of DB Operational Errors. +# Not all transactions will be retried as it can cause undesired state. +# Currently it is only used in ``DagFileProcessor.process_file`` to retry ``dagbag.sync_to_db``. +max_db_retries = 3 + + +[logging] +# The folder where airflow should store its log files. This location +base_log_folder = $AIRFLOW_HOME/logs + +# Logging level. 
+# +# Supported values: ``CRITICAL``, ``ERROR``, ``WARNING``, ``INFO``, ``DEBUG``. +logging_level = INFO + +# Logging level for Flask-appbuilder UI. +# +# Supported values: ``CRITICAL``, ``ERROR``, ``WARNING``, ``INFO``, ``DEBUG``. +fab_logging_level = WARN + +# Logging class +# Specify the class that will specify the logging configuration +# This class has to be on the python classpath +# Example: logging_config_class = my.path.default_local_settings.LOGGING_CONFIG +# logging_config_class = + +# Flag to enable/disable Colored logs in Console +# Colour the logs when the controlling terminal is a TTY. +colored_console_log = True + +# Log format for when Colored logs is enabled +colored_log_format = [%%(blue)s%%(asctime)s%%(reset)s] {{%%(blue)s%%(filename)s:%%(reset)s%%(lineno)d}} %%(log_color)s%%(levelname)s%%(reset)s - %%(log_color)s%%(message)s%%(reset)s +colored_formatter_class = airflow.utils.log.colored_log.CustomTTYColoredFormatter + +# Format of Log line +log_format = [%%(asctime)s] {{%%(filename)s:%%(lineno)d}} %%(levelname)s - %%(message)s +simple_log_format = %%(asctime)s %%(levelname)s - %%(message)s + +# Specify prefix pattern like mentioned below with stream handler TaskHandlerWithCustomFormatter +# Example: task_log_prefix_template = {{ti.dag_id}}-{{ti.task_id}}-{{execution_date}}-{{try_number}} +# task_log_prefix_template = + +# Formatting for how airflow generates file names/paths for each task run. +log_filename_template = {{ ti.dag_id }}/{{ ti.task_id }}/{{ execution_date.strftime("%%Y-%%m-%%dT%%H:%%M:%%S") }}/{{ try_number }}.log + +# Formatting for how airflow generates file names for log +log_processor_filename_template = {{ filename }}.log + +# full path of dag_processor_manager logfile +dag_processor_manager_log_location = {AIRFLOW_HOME}/logs/dag_processor_manager/dag_processor_manager.log + +# Name of handler to read task instance logs. +# Defaults to use ``task`` handler. +task_log_reader = task + +# A comma\-separated list of third-party logger names that will be configured to print messages to +# consoles\. +# Example: extra_loggers = connexion,sqlalchemy +# extra_loggers = + [webserver] -rbac = $WEBSERVER_USE_RBAC # The base url of your website as airflow cannot guess what domain or # cname you are using. This is use in automated emails that # airflow sends to point links to the right web server base_url = $URL +# Default timezone to display all dates in the UI, can be UTC, system, or +# any IANA timezone string (e.g. Europe/Amsterdam). If left empty the +# default value of core/default_timezone will be used +# Example: default_ui_timezone = America/New_York +default_ui_timezone = UTC + # The ip specified when starting the web server web_server_host = 0.0.0.0 @@ -83,19 +242,151 @@ workers = 4 # sync (default), eventlet, gevent worker_class = gevent -# Expose the configuration file in the web server -expose_config = true - # Set to true to turn on authentication : http://pythonhosted.org/airflow/installation.html#web-authentication -authenticate = $AIRFLOW_AUTHENTICATE auth_backend = $AIRFLOW_AUTH_BACKEND -# Filter the list of dags by owner name (requires authentication to be enabled) -filter_by_owner = False +# Paths to the SSL certificate and key for the web server. When both are +# provided SSL will be enabled. This does not change the web server port. +# web_server_ssl_cert = + +# Paths to the SSL certificate and key for the web server. When both are +# provided SSL will be enabled. This does not change the web server port. 
+# web_server_ssl_key = + +# If set to True, Airflow will track files in plugins_folder directory. When it detects changes, +# then reload the gunicorn. +# You can toggle this for Development when iterating on plugins +reload_on_plugin_change = False + +# Log files for the gunicorn webserver. '-' means log to stderr. +access_logfile = - + +# Log files for the gunicorn webserver. '-' means log to stderr. +error_logfile = - + +# Access log format for gunicorn webserver. +# default format is %%(h)s %%(l)s %%(u)s %%(t)s "%%(r)s" %%(s)s %%(b)s "%%(f)s" "%%(a)s" +# documentation - https://docs.gunicorn.org/en/stable/settings.html#access-log-format +# access_logformat = + +# Expose the configuration file in the web server +expose_config = True + +# Expose hostname in the web server +expose_hostname = True + +# Expose stacktrace in the web server +expose_stacktrace = True + +# Default DAG view. Valid values are: ``tree``, ``graph``, ``duration``, ``gantt``, ``landing_times`` +dag_default_view = tree + +# Default DAG orientation. Valid values are: +# ``LR`` (Left->Right), ``TB`` (Top->Bottom), ``RL`` (Right->Left), ``BT`` (Bottom->Top) +dag_orientation = LR + +# The amount of time (in secs) webserver will wait for initial handshake +# while fetching logs from other worker machine +log_fetch_timeout_sec = 5 + +# Time interval (in secs) to wait before next log fetching. +log_fetch_delay_sec = 2 + +# Distance away from page bottom to enable auto tailing. +log_auto_tailing_offset = 30 + +# Animation speed for auto tailing log display. +log_animation_speed = 1000 + +# By default, the webserver shows paused DAGs. Flip this to hide paused +# DAGs by default +hide_paused_dags_by_default = False + +# Consistent page size across all listing views in the UI +page_size = 100 + +# Define the color of navigation bar +navbar_color = #fff + +# Default dagrun to show in UI +default_dag_run_display_number = 25 + +# Enable werkzeug ``ProxyFix`` middleware for reverse proxy +enable_proxy_fix = False + +# Number of values to trust for ``X-Forwarded-For``. +# More info: https://werkzeug.palletsprojects.com/en/0.16.x/middleware/proxy_fix/ +proxy_fix_x_for = 1 + +# Number of values to trust for ``X-Forwarded-Proto`` +proxy_fix_x_proto = 1 + +# Number of values to trust for ``X-Forwarded-Host`` +proxy_fix_x_host = 1 + +# Number of values to trust for ``X-Forwarded-Port`` +proxy_fix_x_port = 1 + +# Number of values to trust for ``X-Forwarded-Prefix`` +proxy_fix_x_prefix = 1 + +# Set secure flag on session cookie +cookie_secure = False + +# Set samesite policy on session cookie +cookie_samesite = Lax + +# Default setting for wrap toggle on DAG code and TI log views. +default_wrap = False + +# Allow the UI to be rendered in a frame +x_frame_enabled = True + +# Send anonymous user activity to your analytics tool +# choose from google_analytics, segment, or metarouter +# analytics_tool = + +# Unique ID of your account in the analytics tool +# analytics_id = + +# 'Recent Tasks' stats will show for old DagRuns if set +show_recent_stats_for_completed_runs = True + +# Update FAB permissions and sync security manager roles +# on webserver startup +update_fab_perms = True + +# The UI cookie lifetime in minutes. 
User will be logged out from UI after +# ``session_lifetime_minutes`` of non-activity +session_lifetime_minutes = 43200 + +# Sets a custom page title for the DAGs overview page and site title for all pages +# instance_name = + [email] email_backend = $AIRFLOW_EMAIL_BACKEND +# Email connection to use +# email_conn_id = smtp_default + +# Whether email alerts should be sent when a task is retried +default_email_on_retry = True + +# Whether email alerts should be sent when a task failed +default_email_on_failure = True + +# File that will be used as the template for Email subject (which will be rendered using Jinja2). +# If not set, Airflow uses a base template. +# Example: subject_template = /path/to/my_subject_template_file +# subject_template = + +# File that will be used as the template for Email content (which will be rendered using Jinja2). +# If not set, Airflow uses a base template. +# Example: html_content_template = /path/to/my_html_content_template_file +# html_content_template = + + [smtp] # If you want airflow to send emails on retries, failure, and you want to # the airflow.utils.send_email function, you have to configure an smtp @@ -107,6 +398,30 @@ smtp_port = 587 smtp_user = $AIRFLOW_SMTP_USER smtp_password = $AIRFLOW_SMTP_PASSWORD smtp_mail_from = $AIRFLOW_SMTP_FROM +# smtp_timeout = 30 +# smtp_retry_limit = 5 + + +[sentry] +# Sentry (https://docs.sentry.io) integration. Here you can supply +# additional configuration options based on the Python platform. See: +# https://docs.sentry.io/error-reporting/configuration/?platform=python. +# Unsupported options: ``integrations``, ``in_app_include``, ``in_app_exclude``, +# ``ignore_errors``, ``before_breadcrumb``, ``before_send``, ``transport``. +# Enable error reporting to Sentry +# sentry_on = false +# sentry_dsn = + + +[celery_kubernetes_executor] +# This section only applies if you are using the ``CeleryKubernetesExecutor`` in +# ``[core]`` section above +# Define when to send a task to ``KubernetesExecutor`` when using ``CeleryKubernetesExecutor``. +# When the queue of a task is the value of ``kubernetes_queue`` (default ``kubernetes``), +# the task is executed via ``KubernetesExecutor``, +# otherwise via ``CeleryExecutor`` +# kubernetes_queue = kubernetes + [celery] # This section only applies if you are using the CeleryExecutor in @@ -121,6 +436,30 @@ celery_app_name = airflow.executors.celery_executor # your worker box and the nature of your tasks worker_concurrency = 32 +# The maximum and minimum concurrency that will be used when starting workers with the +# ``airflow celery worker`` command (always keep minimum processes, but grow +# to maximum if necessary). Note the value should be max_concurrency,min_concurrency +# Pick these numbers based on resources on worker box and the nature of the task. +# If autoscale option is available, worker_concurrency will be ignored. +# http://docs.celeryproject.org/en/latest/reference/celery.bin.worker.html#cmdoption-celery-worker-autoscale +# Example: worker_autoscale = 16,12 +# worker_autoscale = + +# Used to increase the number of tasks that a worker prefetches which can improve performance. +# The number of processes multiplied by worker_prefetch_multiplier is the number of tasks +# that are prefetched by a worker. 
A value greater than 1 can result in tasks being unnecessarily +# blocked if there are multiple workers and one worker prefetches tasks that sit behind long +# running tasks while another worker has unutilized processes that are unable to process the already +# claimed blocked tasks. +# https://docs.celeryproject.org/en/stable/userguide/optimizing.html#prefetch-limits +# Example: worker_prefetch_multiplier = 1 +# worker_prefetch_multiplier = + +# Umask that will be used when starting workers with the ``airflow celery worker`` +# in daemon mode. This control the file-creation mode mask which determines the initial +# value of file permission bits for newly created files. +# worker_umask = 0o077 + # When you start an airflow worker, airflow starts a tiny web server # subprocess to serve the workers local log files to the airflow main # web server, who then builds pages and sends them to users. This defines @@ -136,36 +475,151 @@ broker_url = $AIRFLOW_BROKER_URL # Another key Celery setting result_backend = $AIRFLOW_RESULT_URL +# Celery Flower is a sweet UI for Celery. Airflow has a shortcut to start +# it ``airflow celery flower``. This defines the IP that Celery Flower runs on +flower_host = 0.0.0.0 + +# The root URL for Flower +# Example: flower_url_prefix = /flower +# flower_url_prefix = + # Celery Flower is a sweet UI for Celery. Airflow has a shortcut to start # it `airflow flower`. This defines the port that Celery Flower runs on flower_port = $AIRFLOW_FLOWER_PORT +# Securing Flower with Basic Authentication +# Accepts user:password pairs separated by a comma +# Example: flower_basic_auth = user1:password1,user2:password2 +# flower_basic_auth = + +# How many processes CeleryExecutor uses to sync task state. +# 0 means to use max(1, number of cores - 1) processes. +sync_parallelism = 0 + +# Import path for celery configuration options +celery_config_options = airflow.config_templates.default_celery.DEFAULT_CELERY_CONFIG +ssl_active = False +# ssl_key = +# ssl_cert = +# ssl_cacert = + +# Celery Pool implementation. +# Choices include: ``prefork`` (default), ``eventlet``, ``gevent`` or ``solo``. +# See: +# https://docs.celeryproject.org/en/latest/userguide/workers.html#concurrency +# https://docs.celeryproject.org/en/latest/userguide/concurrency/eventlet.html +pool = prefork + +# The number of seconds to wait before timing out ``send_task_to_executor`` or +# ``fetch_celery_task_state`` operations. +operation_timeout = 3.0 + +# Celery task will report its status as 'started' when the task is executed by a worker. +# This is used in Airflow to keep track of the running tasks and if a Scheduler is restarted +# or run in HA mode, it can adopt the orphan tasks launched by previous SchedulerJob. +task_track_started = True + +# Time in seconds after which Adopted tasks are cleared by CeleryExecutor. This is helpful to clear +# stalled tasks. +task_adoption_timeout = 600 + +# The Maximum number of retries for publishing task messages to the broker when failing +# due to ``AirflowTaskTimeout`` error before giving up and marking Task as failed. +task_publish_max_retries = 3 + +# Worker initialisation check to validate Metadata Database connection +worker_precheck = False + + +# [dask] +# This section only applies if you are using the DaskExecutor in +# [core] section above +# The IP address and port of the Dask cluster's scheduler. +# cluster_address = 127.0.0.1:8786 + +# TLS/ SSL settings to access a secured Dask scheduler. 
+# tls_ca = +# tls_cert = +# tls_key = + + +[celery_broker_transport_options] +# This section is for specifying options which can be passed to the +# underlying celery broker transport. See: +# http://docs.celeryproject.org/en/latest/userguide/configuration.html#std:setting-broker_transport_options +# The visibility timeout defines the number of seconds to wait for the worker +# to acknowledge the task before the message is redelivered to another worker. +# Make sure to increase the visibility timeout to match the time of the longest +# ETA you're planning to use. +# visibility_timeout is only supported for Redis and SQS celery brokers. +# See: +# http://docs.celeryproject.org/en/master/userguide/configuration.html#std:setting-broker_transport_options +# Example: visibility_timeout = 21600 +# visibility_timeout = + + +[operators] # Default queue that tasks get assigned to and that worker listen on. default_queue = default +# The default owner assigned to each new operator, unless +# provided explicitly or passed via ``default_args`` +# default_owner = airflow +# default_cpus = 1 +# default_ram = 512 +# default_disk = 512 +# default_gpus = 0 + +# Is allowed to pass additional/unused arguments (args, kwargs) to the BaseOperator operator. +# If set to False, an exception will be thrown, otherwise only the console message will be displayed. +allow_illegal_arguments = False + + [scheduler] # Task instances listen for external kill signal (when you clear tasks # from the CLI or the UI), this defines the frequency at which they should # listen (in seconds). job_heartbeat_sec = 5 +# How often (in seconds) to check and tidy up 'running' TaskInstancess +# that no longer have a matching DagRun +clean_tis_without_dagrun_interval = 15.0 + # The scheduler constantly tries to trigger new tasks (look at the # scheduler section in the docs for more information). This defines # how often the scheduler should run (in seconds). scheduler_heartbeat_sec = 5 -# after how much time should the scheduler terminate in seconds -# -1 indicates to run continuously (see also num_runs) -run_duration = -1 +# The number of times to try to schedule each DAG file +# -1 indicates unlimited number +num_runs = -1 -# after how much time a new DAGs should be picked up from the filesystem -min_file_process_interval = 0 +# The number of seconds to wait between consecutive DAG file processing +# Deprecated since version 2.2.0: The option has been moved to scheduler.scheduler_idle_sleep_time +processor_poll_interval = 1 -dag_dir_list_interval = 300 +# Number of seconds after which a DAG file is parsed. The DAG file is parsed every +# ``min_file_process_interval`` number of seconds. Updates to DAGs are reflected after +# this interval. Keeping this number low will increase CPU usage. +min_file_process_interval = 60 -# How often should stats be printed to the logs +# How often (in seconds) to scan the DAGs directory for new files. Default to 5 minutes. +# This is set via env var to 300 in prod, but 30 for local testing +dag_dir_list_interval = 30 + +# How often should stats be printed to the logs. Setting to 0 will disable printing stats print_stats_interval = 30 +# How often (in seconds) should pool usage stats be sent to statsd (if statsd_on is enabled) +pool_metrics_interval = 20.0 + +# If the last scheduler heartbeat happened more than scheduler_health_check_threshold +# ago (in seconds), scheduler is considered unhealthy. 
+# This is used by the health check in the "/health" endpoint +scheduler_health_check_threshold = 30 + +# How often (in seconds) should the scheduler check for orphaned tasks and SchedulerJobs +orphaned_tasks_check_interval = 300.0 child_process_log_directory = ${AIRFLOW_HOME}/logs/scheduler # Local task jobs periodically heartbeat to the DB. If the job has @@ -173,22 +627,185 @@ child_process_log_directory = ${AIRFLOW_HOME}/logs/scheduler # associated task instance as failed and will re-schedule the task. scheduler_zombie_task_threshold = 300 - # Turn off scheduler catchup by setting this to False. # Default behavior is unchanged and # Command Line Backfills still work, but the scheduler # will not do scheduler catchup if this is False, # however it can be set on a per DAG basis in the # DAG definition (catchup) -catchup_by_default = True - - +catchup_by_default = False + +# This changes the batch size of queries in the scheduling main loop. +# If this is too high, SQL query performance may be impacted by one +# or more of the following: +# - reversion to full table scan +# - complexity of query predicate +# - excessive locking +# Additionally, you may hit the maximum allowable query length for your db. +# Set this to 0 for no limit (not advised) +max_tis_per_query = 512 + +# Should the scheduler issue ``SELECT ... FOR UPDATE`` in relevant queries. +# If this is set to False then you should not run more than a single +# scheduler at once +use_row_level_locking = True + +# Max number of DAGs to create DagRuns for per scheduler loop +# +# Default: 10 +# max_dagruns_to_create_per_loop = + +# How many DagRuns should a scheduler examine (and lock) when scheduling +# and queuing tasks. +# +# Default: 20 +# max_dagruns_per_loop_to_schedule = + +# Should the Task supervisor process perform a "mini scheduler" to attempt to schedule more tasks of the +# same DAG. Leaving this on will mean tasks in the same DAG execute quicker, but might starve out other +# dags in some circumstances +# +# Default: True +# schedule_after_task_execution = + +# The scheduler can run multiple processes in parallel to parse dags. +# This defines how many processes will run. +parsing_processes = 2 + +# One of ``modified_time``, ``random_seeded_by_host`` and ``alphabetical``. +# The scheduler will list and sort the dag files to decide the parsing order. +# +# * ``modified_time``: Sort by modified time of the files. This is useful on large scale to parse the +# recently modified DAGs first. +# * ``random_seeded_by_host``: Sort randomly across multiple Schedulers but with same order on the +# same host. This is useful when running with Scheduler in HA mode where each scheduler can +# parse different DAG files. +# * ``alphabetical``: Sort by filename +file_parsing_sort_mode = modified_time + +# Turn off scheduler use of cron intervals by setting this to False. +# DAGs submitted manually in the web UI or with trigger_dag will still run. +use_job_schedule = True + +# Allow externally triggered DagRuns for Execution Dates in the future +# Only has effect if schedule_interval is set to None in DAG +allow_trigger_in_future = False + +# DAG dependency detector class to use +dependency_detector = airflow.serialization.serialized_objects.DependencyDetector + + +[metrics] # Statsd (https://github.com/etsy/statsd) integration settings # statsd_on = False # statsd_host = localhost # statsd_port = 8125 # statsd_prefix = airflow +# To enable datadog integration to send airflow metrics. 
+statsd_datadog_enabled = False + +# List of datadog tags attached to all metrics(e.g: key1:value1,key2:value2) +# statsd_datadog_tags = + + +# [secrets] +# Full class name of secrets backend to enable (will precede env vars and metastore in search path) +# Example: backend = airflow.providers.amazon.aws.secrets.systems_manager.SystemsManagerParameterStoreBackend +# backend = + +# The backend_kwargs param is loaded into a dictionary and passed to __init__ of secrets backend class. +# See documentation for the secrets backend you are using. JSON is expected. +# Example for AWS Systems Manager ParameterStore: +# ``{{"connections_prefix": "/airflow/connections", "profile_name": "default"}}`` +# backend_kwargs = + +# [cli] +# In what way should the cli access the API. The LocalClient will use the +# database directly, while the json_client will use the api running on the +# webserver +# api_client = airflow.api.client.local_client + +# If you set web_server_url_prefix, do NOT forget to append it here, ex: +# ``endpoint_url = http://localhost:8080/myroot`` +# So api will look like: ``http://localhost:8080/myroot/api/experimental/...`` +# endpoint_url = http://localhost:8080 + + +[debug] +# Used only with ``DebugExecutor``. If set to ``True`` DAG will fail with first +# failed task. Helpful for debugging purposes. +fail_fast = False + + +[api] +# Enables the deprecated experimental API. Please note that these APIs do not have access control. +# The authenticated user has full access. +# +# .. warning:: +# +# This `Experimental REST API `__ is +# deprecated since version 2.0. Please consider using +# `the Stable REST API `__. +# For more information on migration, see +# `UPDATING.md `_ +enable_experimental_api = False + +# How to authenticate users of the API. See +# https://airflow.apache.org/docs/apache-airflow/stable/security.html for possible values. +# ("airflow.api.auth.backend.default" allows all requests for historic reasons) +auth_backend = airflow.api.auth.backend.deny_all + +# Used to set the maximum page limit for API requests +maximum_page_limit = 100 + +# Used to set the default page limit when limit is zero. A default limit +# of 100 is set on OpenApi spec. However, this particular default limit +# only work when limit is set equal to zero(0) from API requests. +# If no limit is supplied, the OpenApi spec default is used. +fallback_page_limit = 100 + +# The intended audience for JWT token credentials used for authorization. This value must match on the client and server sides. If empty, audience will not be tested. +# Example: google_oauth2_audience = project-id-random-value.apps.googleusercontent.com +# google_oauth2_audience = + +# Path to Google Cloud Service Account key file (JSON). If omitted, authorization based on +# `the Application Default Credentials +# `__ will +# be used. +# Example: google_key_path = /files/service-account-json +# google_key_path = + +# Used in response to a preflight request to indicate which HTTP +# headers can be used when making the actual request. This header is +# the server side response to the browser's +# Access-Control-Request-Headers header. +# access_control_allow_headers = + +# Specifies the method or methods allowed when accessing the resource. +# access_control_allow_methods = + +# Indicates whether the response can be shared with requesting code from the given origin. +# access_control_allow_origin = + + +# [smart_sensor] +# TODO(hwoo) - Test smart sensors and enable this if the need arises. 
+# When `use_smart_sensor` is True, Airflow redirects multiple qualified sensor tasks to +# smart sensor task. +# use_smart_sensor = False + +# `shard_code_upper_limit` is the upper limit of `shard_code` value. The `shard_code` is generated +# by `hashcode % shard_code_upper_limit`. +# shard_code_upper_limit = 10000 + +# The number of running smart sensor processes for each service. +# shards = 5 + +# comma separated sensor classes support in smart_sensor. +# sensors_enabled = NamedHivePartitionSensor + + [mesos] # Mesos master address which MesosExecutor will connect to. master = localhost:5050 @@ -223,3 +840,35 @@ authenticate = False # Mesos credentials, if authentication is enabled # default_principal = admin # default_secret = admin + + +# [lineage] +# what lineage backend to use +# backend = + +# [atlas] +# sasl_enabled = False +# host = +# port = 21000 +# username = +# password = + +# [hive] +# Default mapreduce queue for HiveOperator tasks +# default_hive_mapred_queue = + +# Template for mapred_job_name in HiveOperator, supports the following named parameters +# hostname, dag_id, task_id, execution_date +# mapred_job_name_template = + +# [kerberos] +# ccache = /tmp/airflow_krb5_ccache + +# gets augmented with fqdn +# principal = airflow +# reinit_frequency = 3600 +# kinit_path = kinit +# keytab = airflow.keytab + +# [github_enterprise] +# api_rev = v3 diff --git a/bin/add_gcp_creds b/bin/add_gcp_creds index 731363e56..adf5d4082 100755 --- a/bin/add_gcp_creds +++ b/bin/add_gcp_creds @@ -33,13 +33,12 @@ function update_gcp() { container_id=$(docker ps | grep telemetry-airflow_web | cut -d' ' -f1) docker exec $container_id \ - airflow connections -d --conn_id $conn_id + airflow connections delete $conn_id docker exec $container_id \ - airflow connections -a \ - --conn_id $conn_id \ - --conn_type google_cloud_platform \ - --conn_extra "$(format_gcp $keyfile)" + airflow connections add $conn_id \ + --conn-type google_cloud_platform \ + --conn-extra "$(format_gcp $keyfile)" } update_gcp $connection $keyfile_path diff --git a/bin/run b/bin/run index 89f00f79d..6b1f0c0b1 100755 --- a/bin/run +++ b/bin/run @@ -68,13 +68,12 @@ init_connections() { export AWS_ACCESS_KEY_ID=${AWS_ACCESS_KEY_ID:-dummy_access_key_id} export AWS_SECRET_ACCESS_KEY=${AWS_SECRET_ACCESS_KEY:-dummy_secret_access_key} - airflow connections --delete --conn_id databricks_default + airflow connections delete databricks_default - airflow connections --add \ - --conn_id databricks_default \ - --conn_type databricks \ - --conn_host https://dbc-caf9527b-e073.cloud.databricks.com \ - --conn_extra "{\"token\":\"${DB_TOKEN}\", \"host\": \"\"}" + airflow connections add databricks_default \ + --conn-type databricks \ + --conn-host https://dbc-caf9527b-e073.cloud.databricks.com \ + --conn-extra "{\"token\":\"${DB_TOKEN}\", \"host\": \"\"}" gcp_conn=( "google_cloud_airflow_dataproc" @@ -87,11 +86,11 @@ init_connections() { "google_cloud_shared_prod" ) for conn_id in "${gcp_conn[@]}"; do - airflow connections --delete --conn_id "${conn_id}" - airflow connections --add \ - --conn_id "${conn_id}" \ - --conn_type google_cloud_platform \ - --conn_extra "$(gcp_default_extras)" + airflow connections delete "${conn_id}" + + airflow connections add "${conn_id}" \ + --conn-type google_cloud_platform \ + --conn-extra "$(gcp_default_extras)" done aws_conn=( @@ -107,46 +106,46 @@ init_connections() { "aws_socorro_readonly_s3" ) for conn_id in "${aws_conn[@]}"; do - airflow connections --delete --conn_id "${conn_id}" - airflow connections --add 
\ - --conn_id "${conn_id}" \ - --conn_type s3 \ - --conn_extra "$(aws_default_extras)" + airflow connections delete "${conn_id}" + + airflow connections add "${conn_id}" \ + --conn-type s3 \ + --conn-extra "$(aws_default_extras)" done - airflow connections --delete --conn_id "http_netlify_build_webhook" - airflow connections --add \ - --conn_id "http_netlify_build_webhook" \ - --conn_type http \ - --conn_host "https://httpbin.org/" + airflow connections delete "http_netlify_build_webhook" + + airflow connections add "http_netlify_build_webhook" \ + --conn-type http \ + --conn-host "https://httpbin.org/" } init_variables() { - airflow variables -s "bugzilla_probe_expiry_bot_api_key" "bugzilla-api-key" - airflow variables -s "app_store_connect_username" "username" - airflow variables -s "app_store_connect_password" "password" - airflow variables -s "surveygizmo_daily_attitudes_survey_id" "12345" - airflow variables -s "surveygizmo_api_token" "tokentokentoken" - airflow variables -s "surveygizmo_api_secret" "tapsekret" - airflow variables -s "jetstream_cluster_ip" "127.0.0.1" - airflow variables -s "jetstream_cluster_cert" "cert" - - airflow variables -s "taar_bigtable_instance_id" "taar_bigtable_instance_id" - airflow variables -s "taar_etl_storage_bucket" "taar_etl_storage_bucket" - airflow variables -s "taar_etl_model_storage_bucket" "taar_etl_model_storage_bucket" - airflow variables -s "taar_gcp_project_id" "taar_gcp_project_id" - airflow variables -s "taar_dataflow_subnetwork" "taar_dataflow_subnetwork" - airflow variables -s "taar_dataflow_service_account_email" "taar_dataflow_service_account_email" - - airflow variables -s "looker_repos_secret_git_ssh_key_b64" "looker_repos_secret_git_ssh_key_b64" - airflow variables -s "looker_api_client_id_staging" "looker_api_client_id_staging" - airflow variables -s "looker_api_client_secret_staging" "looker_api_client_secret_staging" - airflow variables -s "looker_api_client_id_prod" "looker_api_client_id_prod" - airflow variables -s "looker_api_client_secret_prod" "looker_api_client_secret_prod" - airflow variables -s "dataops_looker_github_secret_access_token" "dataops_looker_github_secret_access_token" - - airflow variables -s "glean_dictionary_netlify_build_webhook_id" "status/200" - airflow variables -s "lookml_generator_release_str" "v0.0.0" + airflow variables set "bugzilla_probe_expiry_bot_api_key" "bugzilla-api-key" + airflow variables set "app_store_connect_username" "username" + airflow variables set "app_store_connect_password" "password" + airflow variables set "surveygizmo_daily_attitudes_survey_id" "12345" + airflow variables set "surveygizmo_api_token" "tokentokentoken" + airflow variables set "surveygizmo_api_secret" "tapsekret" + airflow variables set "jetstream_cluster_ip" "127.0.0.1" + airflow variables set "jetstream_cluster_cert" "cert" + + airflow variables set "taar_bigtable_instance_id" "taar_bigtable_instance_id" + airflow variables set "taar_etl_storage_bucket" "taar_etl_storage_bucket" + airflow variables set "taar_etl_model_storage_bucket" "taar_etl_model_storage_bucket" + airflow variables set "taar_gcp_project_id" "taar_gcp_project_id" + airflow variables set "taar_dataflow_subnetwork" "taar_dataflow_subnetwork" + airflow variables set "taar_dataflow_service_account_email" "taar_dataflow_service_account_email" + + airflow variables set "looker_repos_secret_git_ssh_key_b64" "looker_repos_secret_git_ssh_key_b64" + airflow variables set "looker_api_client_id_staging" "looker_api_client_id_staging" + airflow variables 
set "looker_api_client_secret_staging" "looker_api_client_secret_staging" + airflow variables set "looker_api_client_id_prod" "looker_api_client_id_prod" + airflow variables set "looker_api_client_secret_prod" "looker_api_client_secret_prod" + airflow variables set "dataops_looker_github_secret_access_token" "dataops_looker_github_secret_access_token" + + airflow variables set "glean_dictionary_netlify_build_webhook_id" "status/200" + airflow variables set "lookml_generator_release_str" "v0.0.0" } [ $# -lt 1 ] && usage diff --git a/bin/start_gke b/bin/start_gke index 5cd0bf785..c398999ce 100755 --- a/bin/start_gke +++ b/bin/start_gke @@ -44,10 +44,10 @@ fi CONTAINER_ID=$(docker ps | grep _web | cut -d' ' -f1) echo "Web container id is $CONTAINER_ID. Adding gcp connection..." -docker exec $CONTAINER_ID airflow connections -d --conn_id $GCP_CONN_ID +docker exec $CONTAINER_ID airflow connections delete $GCP_CONN_ID -docker exec $CONTAINER_ID airflow connections -a --conn_id $GCP_CONN_ID \ - --conn_type google_cloud_platform \ - --conn_extra "$JSON_CREDS" +docker exec $CONTAINER_ID airflow connections add $GCP_CONN_ID \ + --conn-type google_cloud_platform \ + --conn-extra "$JSON_CREDS" echo "visit https://go.corp.mozilla.com/wtmodev for more info" diff --git a/bin/test-parse b/bin/test-parse index 2d5fd69cf..ff7ff716d 100755 --- a/bin/test-parse +++ b/bin/test-parse @@ -44,7 +44,7 @@ function get_errors_in_listing { # Parse the logs for ERROR messages, these typically correspond to python # exceptions in the DAG. In general, there should NOT be any errors when # runnning the local environment. - docker-compose exec web airflow dags list | grep "ERROR" + docker-compose exec web airflow dags list -v | grep "ERROR" } @@ -77,7 +77,7 @@ function main() { if [[ $num_errors -ne 0 && $TESTING -eq 0 ]]; then # Print full error output - docker-compose exec web airflow list_dags + docker-compose exec web airflow dags list -v echo "Failure!" 
exit 1 elif [[ $TESTING -eq 1 ]]; then diff --git a/config/airflow_local_settings.py b/config/airflow_local_settings.py index deb6c2b33..20c36fe90 100644 --- a/config/airflow_local_settings.py +++ b/config/airflow_local_settings.py @@ -1,3 +1,8 @@ +import gevent +from gevent import monkey, pool + +monkey.patch_all() + STATE_COLORS = { "queued": 'gray', "running": 'lime', diff --git a/dags/.airflowignore b/dags/.airflowignore new file mode 100644 index 000000000..e69de29bb diff --git a/dags/adjust_import.py b/dags/adjust_import.py index 0dff5e739..d6e13bad9 100644 --- a/dags/adjust_import.py +++ b/dags/adjust_import.py @@ -3,7 +3,7 @@ from datetime import datetime, timedelta from airflow import DAG -from airflow.operators.sensors import ExternalTaskSensor +from airflow.sensors.external_task import ExternalTaskSensor from airflow.operators.subdag_operator import SubDagOperator from utils.dataproc import ( moz_dataproc_pyspark_runner, diff --git a/dags/bhr_collection.py b/dags/bhr_collection.py index c0780c279..c92baf174 100644 --- a/dags/bhr_collection.py +++ b/dags/bhr_collection.py @@ -1,7 +1,7 @@ import datetime from airflow import DAG -from airflow.contrib.hooks.aws_hook import AwsHook +from airflow.providers.amazon.aws.hooks.base_aws import AwsBaseHook from operators.task_sensor import ExternalTaskCompletedSensor from airflow.operators.subdag_operator import SubDagOperator @@ -29,7 +29,7 @@ ) as dag: # Jobs read from/write to s3://telemetry-public-analysis-2/bhr/data/hang_aggregates/ write_aws_conn_id = 'aws_dev_telemetry_public_analysis_2_rw' - aws_access_key, aws_secret_key, _ = AwsHook(write_aws_conn_id).get_credentials() + aws_access_key, aws_secret_key, _ = AwsBaseHook(aws_conn_id=write_aws_conn_id, client_type='s3').get_credentials() wait_for_bhr_ping = ExternalTaskCompletedSensor( task_id="wait_for_bhr_ping", diff --git a/dags/burnham.py b/dags/burnham.py index bb3b81fdd..ad27b06a6 100644 --- a/dags/burnham.py +++ b/dags/burnham.py @@ -10,13 +10,12 @@ import time from airflow import DAG -from airflow.contrib.hooks.gcp_api_base_hook import GoogleCloudBaseHook -from airflow.operators import PythonOperator +from airflow.operators.python import PythonOperator from operators.bq_sensor import BigQuerySQLSensorOperator from operators.gcp_container_operator import GKEPodOperator DOCS = """\ -# burnham 👩‍🚀📈🤖 +# burnham The burnham project is an end-to-end test suite that aims to automatically verify that Glean-based products correctly measure, collect, and submit @@ -359,6 +358,7 @@ # GCP and GKE default values DEFAULT_GCP_CONN_ID = "google_cloud_derived_datasets" +DEFAULT_GCP_PROJECT_ID = "moz-fx-data-derived-datasets" DEFAULT_GKE_LOCATION = "us-central1-a" DEFAULT_GKE_CLUSTER_NAME = "bq-load-gke-1" DEFAULT_GKE_NAMESPACE = "default" @@ -420,7 +420,7 @@ def burnham_run( return GKEPodOperator( task_id=task_id, gcp_conn_id=gcp_conn_id, - project_id=GoogleCloudBaseHook(gcp_conn_id=gcp_conn_id).project_id, + project_id=DEFAULT_GCP_PROJECT_ID, location=gke_location, cluster_name=gke_cluster_name, namespace=gke_namespace, @@ -446,7 +446,7 @@ def burnham_sensor(task_id, sql, gcp_conn_id=DEFAULT_GCP_CONN_ID, **kwargs): return BigQuerySQLSensorOperator( task_id=task_id, sql=sql, - bigquery_conn_id=gcp_conn_id, + gcp_conn_id=gcp_conn_id, use_legacy_sql=False, **kwargs, ) @@ -483,7 +483,7 @@ def burnham_bigquery_run( return GKEPodOperator( task_id=task_id, gcp_conn_id=gcp_conn_id, - project_id=GoogleCloudBaseHook(gcp_conn_id=gcp_conn_id).project_id, + project_id=DEFAULT_GCP_PROJECT_ID, 
location=gke_location, cluster_name=gke_cluster_name, namespace=gke_namespace, diff --git a/dags/clean_gke_pods.py b/dags/clean_gke_pods.py index f96e3122e..36fd4c34a 100644 --- a/dags/clean_gke_pods.py +++ b/dags/clean_gke_pods.py @@ -2,8 +2,6 @@ from datetime import timedelta, datetime from operators.gcp_container_operator import GKEPodOperator -from airflow.contrib.hooks.gcp_api_base_hook import GoogleCloudBaseHook - docs = """ ### Clean GKE Pods diff --git a/dags/copy_deduplicate.py b/dags/copy_deduplicate.py index fa03aa4aa..881ef26c5 100644 --- a/dags/copy_deduplicate.py +++ b/dags/copy_deduplicate.py @@ -1,7 +1,7 @@ import datetime from airflow import models -from airflow.operators.sensors import ExternalTaskSensor +from airflow.sensors.external_task import ExternalTaskSensor from airflow.operators.subdag_operator import SubDagOperator from utils.gcp import ( bigquery_etl_copy_deduplicate, @@ -10,7 +10,6 @@ bigquery_xcom_query, ) -from airflow.contrib.hooks.gcp_api_base_hook import GoogleCloudBaseHook from utils.gcp import gke_command DOCS = """\ diff --git a/dags/crash_symbolication.py b/dags/crash_symbolication.py index 5d7f21f5d..028c7285e 100644 --- a/dags/crash_symbolication.py +++ b/dags/crash_symbolication.py @@ -1,7 +1,7 @@ import datetime from airflow import DAG -from airflow.contrib.hooks.aws_hook import AwsHook +from airflow.providers.amazon.aws.hooks.base_aws import AwsBaseHook from operators.task_sensor import ExternalTaskCompletedSensor from airflow.operators.subdag_operator import SubDagOperator @@ -35,13 +35,15 @@ ) as dag: # top_signatures_correlations uploads results to public analysis bucket write_aws_conn_id = "aws_dev_telemetry_public_analysis_2_rw" - analysis_access_key, analysis_secret_key, _ = AwsHook( - write_aws_conn_id + analysis_access_key, analysis_secret_key, _ = AwsBaseHook( + aws_conn_id=write_aws_conn_id, + client_type='s3' ).get_credentials() # modules_with_missing_symbols sends results as email ses_aws_conn_id = "aws_data_iam_ses" - ses_access_key, ses_secret_key, _ = AwsHook(ses_aws_conn_id).get_credentials() + ses_access_key, ses_secret_key, _ = AwsBaseHook( + aws_conn_id=ses_aws_conn_id, client_type='s3').get_credentials() wait_for_socorro_import = ExternalTaskCompletedSensor( task_id="wait_for_socorro_import", diff --git a/dags/experiments_live.py b/dags/experiments_live.py index e4025bd49..1367642ef 100644 --- a/dags/experiments_live.py +++ b/dags/experiments_live.py @@ -3,8 +3,7 @@ from utils.gcp import bigquery_etl_query, gke_command -from airflow.operators.sensors import ExternalTaskSensor -from airflow.contrib.hooks.gcp_api_base_hook import GoogleCloudBaseHook +from airflow.sensors.external_task import ExternalTaskSensor from operators.gcp_container_operator import GKEPodOperator default_args = { @@ -18,6 +17,8 @@ # We rely on max_active_runs=1 at the DAG level to manage the dependency on past runs. 
with DAG('experiments_live', default_args=default_args, + # Will be renamed to max_active_tasks sometime later as main upstream branch states + # max_active_tasks=4, concurrency=4, max_active_runs=1, schedule_interval="*/5 * * * *") as dag: diff --git a/dags/firefox_public_data_report.py b/dags/firefox_public_data_report.py index df16f8e05..7e93fd3d4 100644 --- a/dags/firefox_public_data_report.py +++ b/dags/firefox_public_data_report.py @@ -1,5 +1,5 @@ from airflow import DAG -from airflow.contrib.hooks.aws_hook import AwsHook +from airflow.providers.amazon.aws.hooks.base_aws import AwsBaseHook from operators.task_sensor import ExternalTaskCompletedSensor from airflow.operators.subdag_operator import SubDagOperator from datetime import datetime, timedelta @@ -35,7 +35,7 @@ # Required to write json output to s3://telemetry-public-analysis-2/public-data-report/hardware/ write_aws_conn_id='aws_dev_telemetry_public_analysis_2_rw' -aws_access_key, aws_secret_key, session = AwsHook(write_aws_conn_id).get_credentials() +aws_access_key, aws_secret_key, session = AwsBaseHook(aws_conn_id=write_aws_conn_id, client_type='s3').get_credentials() # hardware_report's execution date will be {now}-7days. It will read last week's main pings, # therefore we need to wait for yesterday's Main Ping deduplication task to finish diff --git a/dags/glam.py b/dags/glam.py index 892d63e77..396e5a1a0 100644 --- a/dags/glam.py +++ b/dags/glam.py @@ -1,8 +1,6 @@ from datetime import datetime, timedelta from airflow import DAG -from airflow.contrib.hooks.gcp_api_base_hook import GoogleCloudBaseHook -from airflow.executors import get_default_executor from operators.task_sensor import ExternalTaskCompletedSensor from airflow.operators.subdag_operator import SubDagOperator @@ -38,12 +36,9 @@ dag = DAG(GLAM_DAG, default_args=default_args, schedule_interval="0 2 * * *") -gcp_conn = GoogleCloudBaseHook("google_cloud_airflow_dataproc") - # Make sure all the data for the given day has arrived before running. wait_for_main_ping = ExternalTaskCompletedSensor( task_id="wait_for_main_ping", - project_id=project_id, external_dag_id="copy_deduplicate", external_task_id="copy_deduplicate_main_ping", execution_delta=timedelta(hours=1), @@ -181,7 +176,6 @@ dataset_id, ), task_id=GLAM_CLIENTS_HISTOGRAM_AGGREGATES_SUBDAG, - executor=get_default_executor(), dag=dag, ) @@ -236,6 +230,10 @@ # SubdagOperator uses a SequentialExecutor by default # so its tasks will run sequentially. +# Note: In 2.0, SubDagOperator is changed to use airflow scheduler instead of +# backfill to schedule tasks in the subdag. User no longer need to specify +# the executor in SubDagOperator. 
(We don't but the assumption that Sequential +# Executor is used is now wrong) clients_histogram_bucket_counts = SubDagOperator( subdag=repeated_subdag( GLAM_DAG, @@ -273,7 +271,6 @@ "counts" ), task_id="extract_user_counts", - executor=get_default_executor(), dag=dag ) @@ -288,7 +285,6 @@ "sample-counts" ), task_id="extract_sample_counts", - executor=get_default_executor(), dag=dag ) @@ -301,7 +297,6 @@ dataset_id ), task_id="extracts", - executor=get_default_executor(), dag=dag, ) diff --git a/dags/glam_subdags/extract.py b/dags/glam_subdags/extract.py index 3690a4a79..71ea1925c 100644 --- a/dags/glam_subdags/extract.py +++ b/dags/glam_subdags/extract.py @@ -1,15 +1,11 @@ -from airflow.contrib.hooks.gcp_api_base_hook import GoogleCloudBaseHook -from airflow.contrib.operators.bigquery_to_gcs import BigQueryToCloudStorageOperator -from airflow.contrib.operators.gcs_delete_operator import ( - GoogleCloudStorageDeleteOperator, -) -from airflow.executors import get_default_executor +from airflow.providers.google.cloud.transfers.bigquery_to_gcs import BigQueryToGCSOperator +from airflow.providers.google.cloud.operators.gcs import GCSDeleteObjectsOperator from airflow.operators.subdag_operator import SubDagOperator from airflow.models import DAG from utils.gcp import bigquery_etl_query -gcp_conn = GoogleCloudBaseHook("google_cloud_airflow_dataproc") +gcp_conn_id = "google_cloud_airflow_dataproc" project_id = "moz-fx-data-shared-prod" glam_bucket = "moz-fx-data-glam-prod-fca7-etl-data" @@ -33,7 +29,6 @@ def extracts_subdag( channel, ), task_id="extract_{}".format(channel), - executor=get_default_executor(), dag=dag, ) @@ -75,24 +70,24 @@ def extract_channel_subdag( dag=dag, ) - gcs_delete = GoogleCloudStorageDeleteOperator( + gcs_delete = GCSDeleteObjectsOperator( task_id="glam_gcs_delete_old_{}_extracts".format(channel), bucket_name=glam_bucket, prefix="aggs-desktop-{}".format(channel), - google_cloud_storage_conn_id=gcp_conn.gcp_conn_id, + gcp_conn_id=gcp_conn_id, dag=dag, ) gcs_destination = "gs://{bucket}/aggs-desktop-{channel}-*.csv".format( bucket=glam_bucket, channel=channel ) - bq2gcs = BigQueryToCloudStorageOperator( + bq2gcs = BigQueryToGCSOperator( task_id="glam_extract_{}_to_csv".format(channel), source_project_dataset_table="{}.{}.{}".format( project_id, dataset_id, bq_extract_table ), destination_cloud_storage_uris=gcs_destination, - bigquery_conn_id=gcp_conn.gcp_conn_id, + gcp_conn_id=gcp_conn_id, export_format="CSV", print_header=False, dag=dag, @@ -135,11 +130,13 @@ def extract_user_counts( dag=dag, ) - gcs_delete = GoogleCloudStorageDeleteOperator( + + gcs_delete = GCSDeleteObjectsOperator( task_id="glam_gcs_delete_{}_extracts".format(task_prefix), bucket_name=glam_bucket, + prefix="glam-extract-firefox-{}".format(file_prefix), - google_cloud_storage_conn_id=gcp_conn.gcp_conn_id, + gcp_conn_id=gcp_conn_id, dag=dag, ) @@ -151,13 +148,14 @@ def extract_user_counts( gcs_destination = "gs://{}/glam-extract-firefox-{}.csv".format( glam_bucket, file_prefix ) - bq2gcs = BigQueryToCloudStorageOperator( + + bq2gcs = BigQueryToGCSOperator( task_id="glam_extract_{}_to_csv".format(task_prefix), source_project_dataset_table="{}.{}.{}".format( project_id, dataset_id, bq_extract_table ), destination_cloud_storage_uris=gcs_destination, - bigquery_conn_id=gcp_conn.gcp_conn_id, + gcp_conn_id=gcp_conn_id, export_format="CSV", print_header=False, dag=dag, diff --git a/dags/glam_subdags/histograms.py b/dags/glam_subdags/histograms.py index b11d654e3..dbb4c3406 100644 --- 
a/dags/glam_subdags/histograms.py +++ b/dags/glam_subdags/histograms.py @@ -1,6 +1,5 @@ from airflow.models import DAG from airflow.operators.subdag_operator import SubDagOperator -from airflow.executors import get_default_executor from glam_subdags.general import repeated_subdag from utils.gcp import bigquery_etl_query @@ -42,7 +41,6 @@ def histogram_aggregates_subdag( dataset_id, ), task_id=GLAM_HISTOGRAM_AGGREGATES_FINAL_SUBDAG, - executor=get_default_executor(), dag=dag, ) diff --git a/dags/graphics_telemetry.py b/dags/graphics_telemetry.py index a94763afe..d5c14dd7e 100644 --- a/dags/graphics_telemetry.py +++ b/dags/graphics_telemetry.py @@ -1,7 +1,8 @@ import datetime +import os from airflow import DAG -from airflow.contrib.hooks.aws_hook import AwsHook +from airflow.providers.amazon.aws.hooks.base_aws import AwsBaseHook from operators.task_sensor import ExternalTaskCompletedSensor from airflow.operators.subdag_operator import SubDagOperator @@ -39,7 +40,11 @@ ) as dag: # Jobs read from/write to s3://telemetry-public-analysis-2/gfx/telemetry-data/ write_aws_conn_id = 'aws_dev_telemetry_public_analysis_2_rw' - aws_access_key, aws_secret_key, _ = AwsHook(write_aws_conn_id).get_credentials() + is_dev = os.environ.get("DEPLOY_ENVIRONMENT") == "dev" + if is_dev: + aws_access_key, aws_secret_key = ('replace_me', 'replace_me') + else: + aws_access_key, aws_secret_key, _ = AwsBaseHook(aws_conn_id=write_aws_conn_id, client_type='s3').get_credentials() wait_for_main_ping = ExternalTaskCompletedSensor( task_id="wait_for_main_ping", diff --git a/dags/incline_dash.py b/dags/incline_dash.py deleted file mode 100644 index 58cec6542..000000000 --- a/dags/incline_dash.py +++ /dev/null @@ -1,88 +0,0 @@ -from airflow import DAG -from datetime import datetime, timedelta - -from utils.gcp import bigquery_etl_query - -from operators.task_sensor import ExternalTaskCompletedSensor -from airflow.contrib.hooks.gcp_api_base_hook import GoogleCloudBaseHook -from operators.gcp_container_operator import GKEPodOperator - -default_args = { - 'owner': 'frank@mozilla.com', - 'depends_on_past': False, - 'start_date': datetime(2020, 1, 1), - 'email_on_failure': True, - 'email_on_retry': True, - 'retries': 2, - 'retry_delay': timedelta(minutes=30), -} - - -with DAG('incline_dashboard', - default_args=default_args, - schedule_interval="0 4 * * *") as dag: - - wait_for_baseline_clients_last_seen = ExternalTaskCompletedSensor( - task_id="wait_for_baseline_clients_last_seen", - external_dag_id="copy_deduplicate", - external_task_id="baseline_clients_last_seen", - execution_delta=timedelta(hours=3), - mode="reschedule", - pool="DATA_ENG_EXTERNALTASKSENSOR", - email_on_retry=False, - ) - - wait_for_core_clients_last_seen = ExternalTaskCompletedSensor( - task_id="wait_for_core_clients_last_seen", - external_dag_id="bqetl_core", - external_task_id="telemetry_derived__core_clients_last_seen__v1", - execution_delta=timedelta(hours=2), - mode="reschedule", - pool="DATA_ENG_EXTERNALTASKSENSOR", - email_on_retry=False, - ) - - project = "moz-fx-data-shared-prod" - dataset = "org_mozilla_firefox_derived" - - migrated_clients = bigquery_etl_query( - task_id="generate_migrated_clients", - project_id=project, - dataset_id=dataset, - # We recreate this entire table from scratch every day because we are - # taking the last seen migration ping over all time for each client. 
- destination_table=None, - date_partition_parameter=None, - sql_file_path="sql/moz-fx-data-shared-prod/org_mozilla_firefox_derived/migrated_clients_v1/init.sql", - owner="frank@mozilla.com", - email=["telemetry-alerts@mozilla.com", "frank@mozilla.com"] - ) - - exec_dash = bigquery_etl_query( - task_id="generate_incline_exec_dash", - destination_table="incline_executive_v1", - project_id=project, - dataset_id=dataset, - owner="frank@mozilla.com", - email=["telemetry-alerts@mozilla.com", "frank@mozilla.com"], - ) - - gcp_conn_id = 'google_cloud_derived_datasets' - export_incline_dash = GKEPodOperator( - task_id="export_incline_dash", - name="export-incline-dash", - arguments=["script/export_incline_dash", "{{ ds }}"], - gcp_conn_id=gcp_conn_id, - project_id=GoogleCloudBaseHook(gcp_conn_id=gcp_conn_id).project_id, - location="us-central1-a", - cluster_name="bq-load-gke-1", - namespace="default", - image="gcr.io/moz-fx-data-airflow-prod-88e0/bigquery-etl:latest", - ) - - ( - [wait_for_baseline_clients_last_seen, wait_for_core_clients_last_seen] >> - migrated_clients >> - exec_dash >> - export_incline_dash - ) diff --git a/dags/ltv.py b/dags/ltv.py index fffce0dab..f749efa1b 100644 --- a/dags/ltv.py +++ b/dags/ltv.py @@ -5,7 +5,10 @@ from operators.task_sensor import ExternalTaskCompletedSensor from airflow.operators.subdag_operator import SubDagOperator from datetime import datetime, timedelta -from operators.backport.bigquery_operator_1_10_2 import BigQueryOperator + +from airflow.providers.google.cloud.operators.bigquery import ( + BigQueryExecuteQueryOperator +) from six.moves.urllib.request import urlopen from utils.dataproc import ( moz_dataproc_pyspark_runner, @@ -109,8 +112,7 @@ 'https://raw.githubusercontent.com/mozilla/bigquery-etl/main/sql', 'moz-fx-data-shared-prod', 'revenue_derived', 'client_ltv_v1', 'query.sql'])) -BigQueryOperator.template_fields += ('query_params',) -ltv_revenue_join=BigQueryOperator( +ltv_revenue_join=BigQueryExecuteQueryOperator( task_id='ltv_revenue_join', sql=response.read().decode('utf-8'), query_params=[{"name": "submission_date", "parameterType": {"type": "DATE"}, "parameterValue": {"value": "{{ ds }}"}}], @@ -129,7 +131,7 @@ 'moz-fx-data-shared-prod', 'revenue_derived', 'client_ltv_normalized', 'query.sql'])) # Normalized LTV View is for general-use and doesn't contain any revenue data -ltv_normalized_view=BigQueryOperator( +ltv_normalized_view=BigQueryExecuteQueryOperator( task_id='ltv_normalized_view', sql=response.read().decode('utf-8'), query_params=[{"name": "submission_date", "parameterType": {"type": "DATE"}, "parameterValue": {"value": "{{ ds }}"}}], @@ -147,7 +149,7 @@ 'https://raw.githubusercontent.com/mozilla/bigquery-etl/main/sql', 'moz-fx-data-shared-prod', 'revenue_derived', 'client_ltv_normalized_v1', 'query.sql'])) -client_ltv_normalized_v1=BigQueryOperator( +client_ltv_normalized_v1=BigQueryExecuteQueryOperator( task_id='client_ltv_normalized_v1', sql=response.read().decode('utf-8'), query_params=[{"name": "submission_date", "parameterType": {"type": "DATE"}, "parameterValue": {"value": "{{ ds }}"}}], diff --git a/dags/mad_server.py b/dags/mad_server.py index 8212bd3d1..60a89ba7e 100644 --- a/dags/mad_server.py +++ b/dags/mad_server.py @@ -1,7 +1,8 @@ +import os from airflow import DAG from datetime import datetime, timedelta -from airflow.contrib.hooks.aws_hook import AwsHook +from airflow.providers.amazon.aws.hooks.base_aws import AwsBaseHook from utils.gcp import gke_command @@ -17,17 +18,22 @@ } with DAG("mad_server", 
default_args=default_args, schedule_interval="@weekly") as dag: - + is_dev = os.environ.get("DEPLOY_ENVIRONMENT") == "dev" aws_conn_id="aws_dev_mad_resources_training" - # mad-server expects AWS creds in some custom env vars. - s3_env_vars = { - key: value - for key, value in zip( - ("S3_ACCESS_KEY_ID", "S3_SECRET_ACCESS_KEY", "S3_SESSION_TOKEN"), - AwsHook(aws_conn_id).get_credentials() if aws_conn_id else (), - ) - if value is not None} + if is_dev: + aws_conn_id = None + s3_env_vars = {} + else: + aws_conn_id="aws_dev_mad_resources_training" + s3_env_vars = { + key: value + for key, value in zip( + ("S3_ACCESS_KEY_ID", "S3_SECRET_ACCESS_KEY", "S3_SESSION_TOKEN"), + AwsBaseHook(aws_conn_id=aws_conn_id, client_type='s3').get_credentials() if aws_conn_id else (), + ) + if value is not None + } mad_server_pull = gke_command( task_id="mad_server_pull", diff --git a/dags/mozaggregator_mobile.py b/dags/mozaggregator_mobile.py index 4299a0874..dac5262c7 100644 --- a/dags/mozaggregator_mobile.py +++ b/dags/mozaggregator_mobile.py @@ -3,8 +3,7 @@ from datetime import datetime, timedelta from airflow import DAG -from airflow.contrib.hooks.gcp_api_base_hook import GoogleCloudBaseHook -from airflow.contrib.operators.gcs_delete_operator import GoogleCloudStorageDeleteOperator +from airflow.providers.google.cloud.operators.gcs import GCSDeleteObjectsOperator from airflow.operators.subdag_operator import SubDagOperator @@ -34,13 +33,13 @@ subdag_args["retries"] = 0 task_id = "mobile_aggregate_view_dataproc" -gcp_conn = GoogleCloudBaseHook("google_cloud_airflow_dataproc") -keyfile = json.loads(gcp_conn.extras["extra__google_cloud_platform__keyfile_dict"]) -project_id = keyfile["project_id"] +gcp_conn_id = "google_cloud_airflow_dataproc" +project_id = "airflow-dataproc" +dev_test_service_account = "replace_me" is_dev = os.environ.get("DEPLOY_ENVIRONMENT") == "dev" client_email = ( - keyfile["client_email"] + dev_test_service_account if is_dev else "dataproc-runner-prod@airflow-dataproc.iam.gserviceaccount.com" ) @@ -100,7 +99,7 @@ "gs://moz-fx-data-derived-datasets-parquet-tmp/avro/mozaggregator/mobile/moz-fx-data-shared-prod", ] ), - gcp_conn_id=gcp_conn.gcp_conn_id, + gcp_conn_id=gcp_conn_id, service_account=client_email, artifact_bucket=artifact_bucket, storage_bucket=storage_bucket, @@ -126,11 +125,11 @@ dag=dag, ).set_downstream(mobile_aggregate_view_dataproc) - GoogleCloudStorageDeleteOperator( + GCSDeleteObjectsOperator( task_id="delete_mobile_metrics_avro", bucket_name="moz-fx-data-derived-datasets-parquet-tmp", prefix="avro/mozaggregator/mobile/moz-fx-data-shared-prod/{{ ds_nodash }}/mobile_metrics_v1", - google_cloud_storage_conn_id=gcp_conn.gcp_conn_id, + gcp_conn_id=gcp_conn_id, dag=dag ).set_upstream(mobile_aggregate_view_dataproc) diff --git a/dags/mozaggregator_prerelease.py b/dags/mozaggregator_prerelease.py index 3ccc3b184..ec95a8118 100644 --- a/dags/mozaggregator_prerelease.py +++ b/dags/mozaggregator_prerelease.py @@ -3,10 +3,7 @@ from datetime import datetime, timedelta from airflow import DAG -from airflow.contrib.operators.gcs_delete_operator import ( - GoogleCloudStorageDeleteOperator, -) -from airflow.contrib.hooks.gcp_api_base_hook import GoogleCloudBaseHook +from airflow.providers.google.cloud.operators.gcs import GCSDeleteObjectsOperator from airflow.operators.subdag_operator import SubDagOperator from utils.dataproc import moz_dataproc_pyspark_runner, copy_artifacts_dev from utils.gcp import gke_command @@ -39,13 +36,13 @@ subdag_args["retries"] = 0 task_id = 
"prerelease_telemetry_aggregate_view_dataproc" -gcp_conn = GoogleCloudBaseHook("google_cloud_airflow_dataproc") -keyfile = json.loads(gcp_conn.extras["extra__google_cloud_platform__keyfile_dict"]) -project_id = keyfile["project_id"] +gcp_conn_id = "google_cloud_airflow_dataproc" +project_id = "airflow-dataproc" +dev_test_service_account = "replace_me" is_dev = os.environ.get("DEPLOY_ENVIRONMENT") == "dev" client_email = ( - keyfile["client_email"] + dev_test_service_account if is_dev else "dataproc-runner-prod@airflow-dataproc.iam.gserviceaccount.com" ) @@ -114,7 +111,7 @@ "gs://moz-fx-data-derived-datasets-parquet-tmp/avro/mozaggregator/prerelease/moz-fx-data-shared-prod", ] ), - gcp_conn_id=gcp_conn.gcp_conn_id, + gcp_conn_id=gcp_conn_id, service_account=client_email, artifact_bucket=artifact_bucket, storage_bucket=storage_bucket, @@ -207,11 +204,11 @@ ).set_downstream(prerelease_telemetry_aggregate_view_dataproc) # Delete the GCS data - GoogleCloudStorageDeleteOperator( + GCSDeleteObjectsOperator( task_id="delete_main_avro", bucket_name="moz-fx-data-derived-datasets-parquet-tmp", prefix="avro/mozaggregator/prerelease/moz-fx-data-shared-prod/{{ ds_nodash }}/main_v4", - google_cloud_storage_conn_id=gcp_conn.gcp_conn_id, + gcp_conn_id=gcp_conn_id, dag=dag, ).set_upstream(prerelease_telemetry_aggregate_view_dataproc) diff --git a/dags/mozaggregator_release.py b/dags/mozaggregator_release.py deleted file mode 100644 index 991645f46..000000000 --- a/dags/mozaggregator_release.py +++ /dev/null @@ -1,30 +0,0 @@ -from airflow import DAG -from datetime import datetime, timedelta - -from airflow.operators.dummy_operator import DummyOperator - -default_args = { - "owner": "frank@mozilla.com", - "depends_on_past": True, - "start_date": datetime(2018, 12, 17), - "email": ["telemetry-alerts@mozilla.com", "frank@mozilla.com"], - "email_on_failure": True, - "email_on_retry": True, - "retries": 3, - "retry_delay": timedelta(minutes=30), -} - -dag = DAG( - "release_telemetry_aggregates", - default_args=default_args, - schedule_interval="@daily", -) - -# See mozaggregator_prerelease and mozaggregator_mobile for functional -# implementations using dataproc operator. This is not implemented due to the -# migration to GCP and https://bugzilla.mozilla.org/show_bug.cgi?id=1517018 -release_telemetry_aggregate_view = DummyOperator( - task_id="release_telemetry_aggregate_view", - job_name="Release Telemetry Aggregate View", - dag=dag, -) diff --git a/dags/operators/backport/README.md b/dags/operators/backport/README.md index 2e439c1f0..be8a40ac4 100644 --- a/dags/operators/backport/README.md +++ b/dags/operators/backport/README.md @@ -7,7 +7,11 @@ the upstream GkePodOperator works fine. ### As of 1.10.12 I've removed the backported 1.10.7 gcp_container_operator, kubernetes_pod_operator, and the 1.10.2 kube_client + ### Fivetran operator backported from 2.0+ Fivetran provides and [operator, sensor and hook](https://github.com/fivetran/airflow-provider-fivetran) for integrating with the Fivetran API for Airflow version 2.0+. Backported to make it usable in Airflow 1.10.15. + +### For 2.1.0 I've removed bigquery_operator_1_10_2.py, in favor of the new +google provider code. 
diff --git a/dags/operators/backport/bigquery_operator_1_10_2.py b/dags/operators/backport/bigquery_operator_1_10_2.py deleted file mode 100644 index 085c39c15..000000000 --- a/dags/operators/backport/bigquery_operator_1_10_2.py +++ /dev/null @@ -1,612 +0,0 @@ -# -*- coding: utf-8 -*- -# -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. - -import json - -from airflow.contrib.hooks.bigquery_hook import BigQueryHook -from airflow.contrib.hooks.gcs_hook import GoogleCloudStorageHook, _parse_gcs_url -from airflow.models import BaseOperator -from airflow.utils.decorators import apply_defaults - - -class BigQueryOperator(BaseOperator): - """ - Executes BigQuery SQL queries in a specific BigQuery database - :param bql: (Deprecated. Use `sql` parameter instead) the sql code to be - executed (templated) - :type bql: Can receive a str representing a sql statement, - a list of str (sql statements), or reference to a template file. - Template reference are recognized by str ending in '.sql'. - :param sql: the sql code to be executed (templated) - :type sql: Can receive a str representing a sql statement, - a list of str (sql statements), or reference to a template file. - Template reference are recognized by str ending in '.sql'. - :param destination_dataset_table: A dotted - (.|:). that, if set, will store the results - of the query. (templated) - :type destination_dataset_table: string - :param write_disposition: Specifies the action that occurs if the destination table - already exists. (default: 'WRITE_EMPTY') - :type write_disposition: string - :param create_disposition: Specifies whether the job is allowed to create new tables. - (default: 'CREATE_IF_NEEDED') - :type create_disposition: string - :param allow_large_results: Whether to allow large results. - :type allow_large_results: boolean - :param flatten_results: If true and query uses legacy SQL dialect, flattens - all nested and repeated fields in the query results. ``allow_large_results`` - must be ``true`` if this is set to ``false``. For standard SQL queries, this - flag is ignored and results are never flattened. - :type flatten_results: boolean - :param bigquery_conn_id: reference to a specific BigQuery hook. - :type bigquery_conn_id: string - :param delegate_to: The account to impersonate, if any. - For this to work, the service account making the request must have domain-wide - delegation enabled. - :type delegate_to: string - :param udf_config: The User Defined Function configuration for the query. - See https://cloud.google.com/bigquery/user-defined-functions for details. - :type udf_config: list - :param use_legacy_sql: Whether to use legacy SQL (true) or standard SQL (false). 
- :type use_legacy_sql: boolean - :param maximum_billing_tier: Positive integer that serves as a multiplier - of the basic price. - Defaults to None, in which case it uses the value set in the project. - :type maximum_billing_tier: integer - :param maximum_bytes_billed: Limits the bytes billed for this job. - Queries that will have bytes billed beyond this limit will fail - (without incurring a charge). If unspecified, this will be - set to your project default. - :type maximum_bytes_billed: float - :param api_resource_configs: a dictionary that contain params - 'configuration' applied for Google BigQuery Jobs API: - https://cloud.google.com/bigquery/docs/reference/rest/v2/jobs - for example, {'query': {'useQueryCache': False}}. You could use it - if you need to provide some params that are not supported by BigQueryOperator - like args. - :type api_resource_configs: dict - :param schema_update_options: Allows the schema of the destination - table to be updated as a side effect of the load job. - :type schema_update_options: tuple - :param query_params: a dictionary containing query parameter types and - values, passed to BigQuery. - :type query_params: dict - :param labels: a dictionary containing labels for the job/query, - passed to BigQuery - :type labels: dict - :param priority: Specifies a priority for the query. - Possible values include INTERACTIVE and BATCH. - The default value is INTERACTIVE. - :type priority: string - :param time_partitioning: configure optional time partitioning fields i.e. - partition by field, type and expiration as per API specifications. - :type time_partitioning: dict - :param cluster_fields: Request that the result of this query be stored sorted - by one or more columns. This is only available in conjunction with - time_partitioning. The order of columns given determines the sort order. - :type cluster_fields: list of str - :param location: The geographic location of the job. Required except for - US and EU. 
See details at - https://cloud.google.com/bigquery/docs/locations#specifying_your_location - :type location: str - """ - - template_fields = ('bql', 'sql', 'destination_dataset_table', 'labels') - template_ext = ('.sql', ) - ui_color = '#e4f0e8' - - @apply_defaults - def __init__(self, - bql=None, - sql=None, - destination_dataset_table=None, - write_disposition='WRITE_EMPTY', - allow_large_results=False, - flatten_results=None, - bigquery_conn_id='bigquery_default', - delegate_to=None, - udf_config=None, - use_legacy_sql=True, - maximum_billing_tier=None, - maximum_bytes_billed=None, - create_disposition='CREATE_IF_NEEDED', - schema_update_options=(), - query_params=None, - labels=None, - priority='INTERACTIVE', - time_partitioning=None, - api_resource_configs=None, - cluster_fields=None, - location=None, - *args, - **kwargs): - super(BigQueryOperator, self).__init__(*args, **kwargs) - self.bql = bql - self.sql = sql if sql else bql - self.destination_dataset_table = destination_dataset_table - self.write_disposition = write_disposition - self.create_disposition = create_disposition - self.allow_large_results = allow_large_results - self.flatten_results = flatten_results - self.bigquery_conn_id = bigquery_conn_id - self.delegate_to = delegate_to - self.udf_config = udf_config - self.use_legacy_sql = use_legacy_sql - self.maximum_billing_tier = maximum_billing_tier - self.maximum_bytes_billed = maximum_bytes_billed - self.schema_update_options = schema_update_options - self.query_params = query_params - self.labels = labels - self.bq_cursor = None - self.priority = priority - self.time_partitioning = time_partitioning - self.api_resource_configs = api_resource_configs - self.cluster_fields = cluster_fields - self.location = location - - # TODO remove `bql` in Airflow 2.0 - if self.bql: - import warnings - warnings.warn('Deprecated parameter `bql` used in Task id: {}. ' - 'Use `sql` parameter instead to pass the sql to be ' - 'executed. 
`bql` parameter is deprecated and ' - 'will be removed in a future version of ' - 'Airflow.'.format(self.task_id), - category=DeprecationWarning) - - if self.sql is None: - raise TypeError('{} missing 1 required positional ' - 'argument: `sql`'.format(self.task_id)) - - def execute(self, context): - if self.bq_cursor is None: - self.log.info('Executing: %s', self.sql) - hook = BigQueryHook( - bigquery_conn_id=self.bigquery_conn_id, - use_legacy_sql=self.use_legacy_sql, - delegate_to=self.delegate_to, - location=self.location, - ) - conn = hook.get_conn() - self.bq_cursor = conn.cursor() - self.bq_cursor.run_query( - sql=self.sql, - destination_dataset_table=self.destination_dataset_table, - write_disposition=self.write_disposition, - allow_large_results=self.allow_large_results, - flatten_results=self.flatten_results, - udf_config=self.udf_config, - maximum_billing_tier=self.maximum_billing_tier, - maximum_bytes_billed=self.maximum_bytes_billed, - create_disposition=self.create_disposition, - query_params=self.query_params, - labels=self.labels, - schema_update_options=self.schema_update_options, - priority=self.priority, - time_partitioning=self.time_partitioning, - api_resource_configs=self.api_resource_configs, - cluster_fields=self.cluster_fields, - ) - - def on_kill(self): - super(BigQueryOperator, self).on_kill() - if self.bq_cursor is not None: - self.log.info('Cancelling running query') - self.bq_cursor.cancel_query() - - -class BigQueryCreateEmptyTableOperator(BaseOperator): - """ - Creates a new, empty table in the specified BigQuery dataset, - optionally with schema. - The schema to be used for the BigQuery table may be specified in one of - two ways. You may either directly pass the schema fields in, or you may - point the operator to a Google cloud storage object name. The object in - Google cloud storage must be a JSON file with the schema fields in it. - You can also create a table without schema. - :param project_id: The project to create the table into. (templated) - :type project_id: string - :param dataset_id: The dataset to create the table into. (templated) - :type dataset_id: string - :param table_id: The Name of the table to be created. (templated) - :type table_id: string - :param schema_fields: If set, the schema field list as defined here: - https://cloud.google.com/bigquery/docs/reference/rest/v2/jobs#configuration.load.schema - **Example**: :: - schema_fields=[{"name": "emp_name", "type": "STRING", "mode": "REQUIRED"}, - {"name": "salary", "type": "INTEGER", "mode": "NULLABLE"}] - :type schema_fields: list - :param gcs_schema_object: Full path to the JSON file containing - schema (templated). For - example: ``gs://test-bucket/dir1/dir2/employee_schema.json`` - :type gcs_schema_object: string - :param time_partitioning: configure optional time partitioning fields i.e. - partition by field, type and expiration as per API specifications. - .. seealso:: - https://cloud.google.com/bigquery/docs/reference/rest/v2/tables#timePartitioning - :type time_partitioning: dict - :param bigquery_conn_id: Reference to a specific BigQuery hook. - :type bigquery_conn_id: string - :param google_cloud_storage_conn_id: Reference to a specific Google - cloud storage hook. - :type google_cloud_storage_conn_id: string - :param delegate_to: The account to impersonate, if any. For this to - work, the service account making the request must have domain-wide - delegation enabled. 
- :type delegate_to: string - :param labels: a dictionary containing labels for the table, passed to BigQuery - **Example (with schema JSON in GCS)**: :: - CreateTable = BigQueryCreateEmptyTableOperator( - task_id='BigQueryCreateEmptyTableOperator_task', - dataset_id='ODS', - table_id='Employees', - project_id='internal-gcp-project', - gcs_schema_object='gs://schema-bucket/employee_schema.json', - bigquery_conn_id='airflow-service-account', - google_cloud_storage_conn_id='airflow-service-account' - ) - **Corresponding Schema file** (``employee_schema.json``): :: - [ - { - "mode": "NULLABLE", - "name": "emp_name", - "type": "STRING" - }, - { - "mode": "REQUIRED", - "name": "salary", - "type": "INTEGER" - } - ] - **Example (with schema in the DAG)**: :: - CreateTable = BigQueryCreateEmptyTableOperator( - task_id='BigQueryCreateEmptyTableOperator_task', - dataset_id='ODS', - table_id='Employees', - project_id='internal-gcp-project', - schema_fields=[{"name": "emp_name", "type": "STRING", "mode": "REQUIRED"}, - {"name": "salary", "type": "INTEGER", "mode": "NULLABLE"}], - bigquery_conn_id='airflow-service-account', - google_cloud_storage_conn_id='airflow-service-account' - ) - :type labels: dict - """ - template_fields = ('dataset_id', 'table_id', 'project_id', - 'gcs_schema_object', 'labels') - ui_color = '#f0eee4' - - @apply_defaults - def __init__(self, - dataset_id, - table_id, - project_id=None, - schema_fields=None, - gcs_schema_object=None, - time_partitioning=None, - bigquery_conn_id='bigquery_default', - google_cloud_storage_conn_id='google_cloud_default', - delegate_to=None, - labels=None, - *args, **kwargs): - - super(BigQueryCreateEmptyTableOperator, self).__init__(*args, **kwargs) - - self.project_id = project_id - self.dataset_id = dataset_id - self.table_id = table_id - self.schema_fields = schema_fields - self.gcs_schema_object = gcs_schema_object - self.bigquery_conn_id = bigquery_conn_id - self.google_cloud_storage_conn_id = google_cloud_storage_conn_id - self.delegate_to = delegate_to - self.time_partitioning = {} if time_partitioning is None else time_partitioning - self.labels = labels - - def execute(self, context): - bq_hook = BigQueryHook(bigquery_conn_id=self.bigquery_conn_id, - delegate_to=self.delegate_to) - - if not self.schema_fields and self.gcs_schema_object: - - gcs_bucket, gcs_object = _parse_gcs_url(self.gcs_schema_object) - - gcs_hook = GoogleCloudStorageHook( - google_cloud_storage_conn_id=self.google_cloud_storage_conn_id, - delegate_to=self.delegate_to) - schema_fields = json.loads(gcs_hook.download( - gcs_bucket, - gcs_object).decode("utf-8")) - else: - schema_fields = self.schema_fields - - conn = bq_hook.get_conn() - cursor = conn.cursor() - - cursor.create_empty_table( - project_id=self.project_id, - dataset_id=self.dataset_id, - table_id=self.table_id, - schema_fields=schema_fields, - time_partitioning=self.time_partitioning, - labels=self.labels - ) - - -class BigQueryCreateExternalTableOperator(BaseOperator): - """ - Creates a new external table in the dataset with the data in Google Cloud - Storage. - The schema to be used for the BigQuery table may be specified in one of - two ways. You may either directly pass the schema fields in, or you may - point the operator to a Google cloud storage object name. The object in - Google cloud storage must be a JSON file with the schema fields in it. - :param bucket: The bucket to point the external table to. 
(templated) - :type bucket: string - :param source_objects: List of Google cloud storage URIs to point - table to. (templated) - If source_format is 'DATASTORE_BACKUP', the list must only contain a single URI. - :type source_objects: list - :param destination_project_dataset_table: The dotted (.).
- BigQuery table to load data into (templated). If is not included, - project will be the project defined in the connection json. - :type destination_project_dataset_table: string - :param schema_fields: If set, the schema field list as defined here: - https://cloud.google.com/bigquery/docs/reference/rest/v2/jobs#configuration.load.schema - **Example**: :: - schema_fields=[{"name": "emp_name", "type": "STRING", "mode": "REQUIRED"}, - {"name": "salary", "type": "INTEGER", "mode": "NULLABLE"}] - Should not be set when source_format is 'DATASTORE_BACKUP'. - :type schema_fields: list - :param schema_object: If set, a GCS object path pointing to a .json file that - contains the schema for the table. (templated) - :type schema_object: string - :param source_format: File format of the data. - :type source_format: string - :param compression: [Optional] The compression type of the data source. - Possible values include GZIP and NONE. - The default value is NONE. - This setting is ignored for Google Cloud Bigtable, - Google Cloud Datastore backups and Avro formats. - :type compression: string - :param skip_leading_rows: Number of rows to skip when loading from a CSV. - :type skip_leading_rows: int - :param field_delimiter: The delimiter to use for the CSV. - :type field_delimiter: string - :param max_bad_records: The maximum number of bad records that BigQuery can - ignore when running the job. - :type max_bad_records: int - :param quote_character: The value that is used to quote data sections in a CSV file. - :type quote_character: string - :param allow_quoted_newlines: Whether to allow quoted newlines (true) or not (false). - :type allow_quoted_newlines: boolean - :param allow_jagged_rows: Accept rows that are missing trailing optional columns. - The missing values are treated as nulls. If false, records with missing trailing - columns are treated as bad records, and if there are too many bad records, an - invalid error is returned in the job result. Only applicable to CSV, ignored - for other formats. - :type allow_jagged_rows: bool - :param bigquery_conn_id: Reference to a specific BigQuery hook. - :type bigquery_conn_id: string - :param google_cloud_storage_conn_id: Reference to a specific Google - cloud storage hook. - :type google_cloud_storage_conn_id: string - :param delegate_to: The account to impersonate, if any. For this to - work, the service account making the request must have domain-wide - delegation enabled. 
- :type delegate_to: string - :param src_fmt_configs: configure optional fields specific to the source format - :type src_fmt_configs: dict - :param labels a dictionary containing labels for the table, passed to BigQuery - :type labels: dict - """ - template_fields = ('bucket', 'source_objects', - 'schema_object', 'destination_project_dataset_table', 'labels') - ui_color = '#f0eee4' - - @apply_defaults - def __init__(self, - bucket, - source_objects, - destination_project_dataset_table, - schema_fields=None, - schema_object=None, - source_format='CSV', - compression='NONE', - skip_leading_rows=0, - field_delimiter=',', - max_bad_records=0, - quote_character=None, - allow_quoted_newlines=False, - allow_jagged_rows=False, - bigquery_conn_id='bigquery_default', - google_cloud_storage_conn_id='google_cloud_default', - delegate_to=None, - src_fmt_configs={}, - labels=None, - *args, **kwargs): - - super(BigQueryCreateExternalTableOperator, self).__init__(*args, **kwargs) - - # GCS config - self.bucket = bucket - self.source_objects = source_objects - self.schema_object = schema_object - - # BQ config - self.destination_project_dataset_table = destination_project_dataset_table - self.schema_fields = schema_fields - self.source_format = source_format - self.compression = compression - self.skip_leading_rows = skip_leading_rows - self.field_delimiter = field_delimiter - self.max_bad_records = max_bad_records - self.quote_character = quote_character - self.allow_quoted_newlines = allow_quoted_newlines - self.allow_jagged_rows = allow_jagged_rows - - self.bigquery_conn_id = bigquery_conn_id - self.google_cloud_storage_conn_id = google_cloud_storage_conn_id - self.delegate_to = delegate_to - - self.src_fmt_configs = src_fmt_configs - self.labels = labels - - def execute(self, context): - bq_hook = BigQueryHook(bigquery_conn_id=self.bigquery_conn_id, - delegate_to=self.delegate_to) - - if not self.schema_fields and self.schema_object \ - and self.source_format != 'DATASTORE_BACKUP': - gcs_hook = GoogleCloudStorageHook( - google_cloud_storage_conn_id=self.google_cloud_storage_conn_id, - delegate_to=self.delegate_to) - schema_fields = json.loads(gcs_hook.download( - self.bucket, - self.schema_object).decode("utf-8")) - else: - schema_fields = self.schema_fields - - source_uris = ['gs://{}/{}'.format(self.bucket, source_object) - for source_object in self.source_objects] - conn = bq_hook.get_conn() - cursor = conn.cursor() - - cursor.create_external_table( - external_project_dataset_table=self.destination_project_dataset_table, - schema_fields=schema_fields, - source_uris=source_uris, - source_format=self.source_format, - compression=self.compression, - skip_leading_rows=self.skip_leading_rows, - field_delimiter=self.field_delimiter, - max_bad_records=self.max_bad_records, - quote_character=self.quote_character, - allow_quoted_newlines=self.allow_quoted_newlines, - allow_jagged_rows=self.allow_jagged_rows, - src_fmt_configs=self.src_fmt_configs, - labels=self.labels - ) - - -class BigQueryDeleteDatasetOperator(BaseOperator): - """" - This operator deletes an existing dataset from your Project in Big query. - https://cloud.google.com/bigquery/docs/reference/rest/v2/datasets/delete - :param project_id: The project id of the dataset. - :type project_id: string - :param dataset_id: The dataset to be deleted. 
- :type dataset_id: string - **Example**: :: - delete_temp_data = BigQueryDeleteDatasetOperator(dataset_id = 'temp-dataset', - project_id = 'temp-project', - bigquery_conn_id='_my_gcp_conn_', - task_id='Deletetemp', - dag=dag) - """ - - template_fields = ('dataset_id', 'project_id') - ui_color = '#f00004' - - @apply_defaults - def __init__(self, - dataset_id, - project_id=None, - bigquery_conn_id='bigquery_default', - delegate_to=None, - *args, **kwargs): - self.dataset_id = dataset_id - self.project_id = project_id - self.bigquery_conn_id = bigquery_conn_id - self.delegate_to = delegate_to - - self.log.info('Dataset id: %s', self.dataset_id) - self.log.info('Project id: %s', self.project_id) - - super(BigQueryDeleteDatasetOperator, self).__init__(*args, **kwargs) - - def execute(self, context): - bq_hook = BigQueryHook(bigquery_conn_id=self.bigquery_conn_id, - delegate_to=self.delegate_to) - - conn = bq_hook.get_conn() - cursor = conn.cursor() - - cursor.delete_dataset( - project_id=self.project_id, - dataset_id=self.dataset_id - ) - - -class BigQueryCreateEmptyDatasetOperator(BaseOperator): - """" - This operator is used to create new dataset for your Project in Big query. - https://cloud.google.com/bigquery/docs/reference/rest/v2/datasets#resource - :param project_id: The name of the project where we want to create the dataset. - Don't need to provide, if projectId in dataset_reference. - :type project_id: str - :param dataset_id: The id of dataset. Don't need to provide, - if datasetId in dataset_reference. - :type dataset_id: str - :param dataset_reference: Dataset reference that could be provided with request body. - More info: - https://cloud.google.com/bigquery/docs/reference/rest/v2/datasets#resource - :type dataset_reference: dict - **Example**: :: - create_new_dataset = BigQueryCreateEmptyDatasetOperator( - dataset_id = 'new-dataset', - project_id = 'my-project', - dataset_reference = {"friendlyName": "New Dataset"} - bigquery_conn_id='_my_gcp_conn_', - task_id='newDatasetCreator', - dag=dag) - """ - - template_fields = ('dataset_id', 'project_id') - ui_color = '#f0eee4' - - @apply_defaults - def __init__(self, - dataset_id, - project_id=None, - dataset_reference=None, - bigquery_conn_id='bigquery_default', - delegate_to=None, - *args, **kwargs): - self.dataset_id = dataset_id - self.project_id = project_id - self.bigquery_conn_id = bigquery_conn_id - self.dataset_reference = dataset_reference if dataset_reference else {} - self.delegate_to = delegate_to - - self.log.info('Dataset id: %s', self.dataset_id) - self.log.info('Project id: %s', self.project_id) - - super(BigQueryCreateEmptyDatasetOperator, self).__init__(*args, **kwargs) - - def execute(self, context): - bq_hook = BigQueryHook(bigquery_conn_id=self.bigquery_conn_id, - delegate_to=self.delegate_to) - - conn = bq_hook.get_conn() - cursor = conn.cursor() - - cursor.create_empty_dataset( - project_id=self.project_id, - dataset_id=self.dataset_id, - dataset_reference=self.dataset_reference) diff --git a/dags/operators/bq_sensor.py b/dags/operators/bq_sensor.py index d95aba481..cec0b382e 100644 --- a/dags/operators/bq_sensor.py +++ b/dags/operators/bq_sensor.py @@ -18,9 +18,8 @@ # under the License. 
from airflow.sensors.base_sensor_operator import BaseSensorOperator -from airflow.contrib.hooks.bigquery_hook import BigQueryHook -from airflow.utils.decorators import apply_defaults +from airflow.providers.google.cloud.hooks.bigquery import BigQueryHook class BigQuerySQLSensorOperator(BaseSensorOperator): """ @@ -30,9 +29,9 @@ class BigQuerySQLSensorOperator(BaseSensorOperator): single value. If that value is coerced to false in some way, the sensor continues to wait. :type sql: str - :param bigquery_conn_id: The connection ID to use when connecting to + :param gcp_conn_id: The connection ID to use when connecting to Google BigQuery. - :type bigquery_conn_id: str + :type gcp_conn_id: str :param use_legacy_sql: Whether to use BQ legacy SQL :type use_legacy_sql: bool :param timeout: Time in seconds to wait for the sensor, @@ -40,14 +39,13 @@ class BigQuerySQLSensorOperator(BaseSensorOperator): :type timeout: int """ - template_fields = BaseSensorOperator.template_fields + [ + template_fields = BaseSensorOperator.template_fields + ( 'sql', - ] + ) - @apply_defaults def __init__(self, sql, - bigquery_conn_id='bigquery_default_conn', + gcp_conn_id='bigquery_default_conn', use_legacy_sql=False, timeout=60*60*24, *args, @@ -58,7 +56,7 @@ def __init__(self, *args, **kwargs) self.sql = sql - self.bigquery_conn_id = bigquery_conn_id + self.gcp_conn_id = gcp_conn_id self.use_legacy_sql = use_legacy_sql self.poke_interval = 120 self.mode = 'reschedule' @@ -78,5 +76,5 @@ def poke(self, context): return True def get_db_hook(self): - return BigQueryHook(bigquery_conn_id=self.bigquery_conn_id, + return BigQueryHook(gcp_conn_id=self.gcp_conn_id, use_legacy_sql=self.use_legacy_sql) diff --git a/dags/operators/gcp_container_operator.py b/dags/operators/gcp_container_operator.py index 2a12ecc3f..ab1a53013 100644 --- a/dags/operators/gcp_container_operator.py +++ b/dags/operators/gcp_container_operator.py @@ -1,38 +1,17 @@ -import os -import subprocess -import tempfile +from airflow.providers.google.cloud.operators.kubernetes_engine import GKEStartPodOperator as UpstreamGKEPodOperator -from google.auth.environment_vars import CREDENTIALS - -from airflow import AirflowException - -from airflow.contrib.hooks.gcp_container_hook import GKEClusterHook - -from airflow.contrib.operators.gcp_container_operator import GKEPodOperator as UpstreamGKEPodOperator - -KUBE_CONFIG_ENV_VAR = "KUBECONFIG" -GCLOUD_APP_CRED = "CLOUDSDK_AUTH_CREDENTIAL_FILE_OVERRIDE" - -# Note: In the next version of airflow this will change. -# This module is deprecated. Please use `airflow.providers.google.cloud.operators.kubernetes_engine`. class GKEPodOperator(UpstreamGKEPodOperator): """ - We override execute and _set_env_from_extras methods to support: - - - `CLOUDSDK_AUTH_CREDENTIAL_FILE_OVERRIDE` environment variable that is - set to the path of the Service Account JSON key file. This is neccesary - for gcloud to operate. - - - Adjust when NamedTemporaryFile file descriptor is closed. - - - Preserve XCOM result when do_xcom_push is True. - - - Override init to default image_pull_policy=Always, in_cluster=False, do_xcom_push=False and GKE params - + - In 1.10.x this inherited from upstream GKEPodOperator, rather than GKEStartPodOperator(v2) + - In 1.10.x we needed to override the execute and helper methods to set an environment +variable for authentication to work (CLOUDSDK_AUTH_CREDENTIAL_FILE_OVERRIDE). 
Fixed in v2 + - We will keep this class and call the upstream GkeStartPodOperator now, because +numerous places in our code references it still + - Overrides init to default image_pull_policy=Always, in_cluster=False, +do_xcom_push=False and GKE params - Defaults reattach_on_restart=False to address a 1.10.12 regression where GkePodOperators reruns will simply attach to an existing pod and not perform any new work. - - Hard sets reattach_on_restart=False when do_xcom_push=True to address an error Retrying a failed task with do_xcom_push=True causes airflow to reattach to the pod eventually causing a 'Handshake status 500 Internal Server Error'. Logs will indicate @@ -75,75 +54,3 @@ def __init__(self, namespace=namespace, *args, **kwargs) - - def execute(self, context): - # We can remove this override once upgraded to 2.0. https://issues.apache.org/jira/browse/AIRFLOW-4072 - - # Moz specific - Commented out key_file references (Jason fixed auth behaviour with 1.10.2) - # key_file = None - - # If gcp_conn_id is not specified gcloud will use the default - # service account credentials. - if self.gcp_conn_id: - from airflow.hooks.base_hook import BaseHook - # extras is a deserialized json object - extras = BaseHook.get_connection(self.gcp_conn_id).extra_dejson - self._set_env_from_extras(extras=extras) # Moz specific since func no longer returns value - - # Write config to a temp file and set the environment variable to point to it. - # This is to avoid race conditions of reading/writing a single file - with tempfile.NamedTemporaryFile() as conf_file: - os.environ[KUBE_CONFIG_ENV_VAR] = conf_file.name - # Attempt to get/update credentials - # We call gcloud directly instead of using google-cloud-python api - # because there is no way to write kubernetes config to a file, which is - # required by KubernetesPodOperator. - # The gcloud command looks at the env variable `KUBECONFIG` for where to save - # the kubernetes config file. - subprocess.check_call( - ["gcloud", "container", "clusters", "get-credentials", - self.cluster_name, - "--zone", self.location, - "--project", self.project_id]) - - # if key_file: # Moz specific commented out - # key_file.close() # Moz specific commented out - - # Tell `KubernetesPodOperator` where the config file is located - self.config_file = os.environ[KUBE_CONFIG_ENV_VAR] - result = super(UpstreamGKEPodOperator, self).execute(context) # Moz specific - if self.do_xcom_push: # Moz specific - return result # Moz specific - - - def _set_env_from_extras(self, extras): - """ - Sets the environment variable `GOOGLE_APPLICATION_CREDENTIALS` and - `CLOUDSDK_AUTH_CREDENTIAL_FILE_OVERRIDE`with either: - - - The path to the keyfile from the specified connection id - - A generated file's path if the user specified JSON in the connection id. The - file is assumed to be deleted after the process dies due to how mkstemp() - works. - - The environment variable is used inside the gcloud command to determine correct - service account to use. 
- """ - key_path = self._get_field(extras, 'key_path', False) - keyfile_json_str = self._get_field(extras, 'keyfile_dict', False) - - if not key_path and not keyfile_json_str: - self.log.info('Using gcloud with application default credentials.') - elif key_path: - os.environ[CREDENTIALS] = key_path - os.environ[GCLOUD_APP_CRED] = key_path - return None - else: - # Write service account JSON to secure file for gcloud to reference - service_key = tempfile.NamedTemporaryFile(delete=False) - service_key.write(keyfile_json_str.encode('utf-8')) - os.environ[CREDENTIALS] = service_key.name - os.environ[GCLOUD_APP_CRED] = service_key.name - # Return file object to have a pointer to close after use, - # thus deleting from file system. - service_key.close() # Moz specific instead of return service_key diff --git a/dags/operators/moz_dataproc_operator.py b/dags/operators/moz_dataproc_operator.py deleted file mode 100644 index dabc13259..000000000 --- a/dags/operators/moz_dataproc_operator.py +++ /dev/null @@ -1,459 +0,0 @@ -import os -import re -import time -import uuid -from datetime import timedelta - -from airflow.contrib.hooks.gcp_dataproc_hook import DataProcHook -from airflow.contrib.operators.dataproc_operator import DataprocOperationBaseOperator -from airflow.exceptions import AirflowException -from airflow.models import BaseOperator -from airflow.utils.decorators import apply_defaults -from airflow.utils import timezone -from airflow.version import version - -""" -We overwrite DataprocClusterCreateOperator here to create clusters with an option to -install component gateway, which we install by default. We also add labels to the gce -cluster config. - -Previously on 1.10.2, we had to include DataprocOperationBaseOperator from master -which used the v1beta2 rest api for creating clusters allowing us to install optional -components and component gateway, but this class has been updated since 1.10.4. - -""" - -# pylint: disable=too-many-instance-attributes -class DataprocClusterCreateOperator(DataprocOperationBaseOperator): - """ - -- - Pulled from 1.10.7 - - We modify the _build_gce_cluster_config method to install component gateway. - -- - Create a new cluster on Google Cloud Dataproc. The operator will wait until the - creation is successful or an error occurs in the creation process. - - The parameters allow to configure the cluster. Please refer to - - https://cloud.google.com/dataproc/docs/reference/rest/v1/projects.regions.clusters - - for a detailed explanation on the different parameters. Most of the configuration - parameters detailed in the link are available as a parameter to this operator. - - :param cluster_name: The name of the DataProc cluster to create. (templated) - :type cluster_name: str - :param project_id: The ID of the google cloud project in which - to create the cluster. (templated) - :type project_id: str - :param num_workers: The # of workers to spin up. 
If set to zero will - spin up cluster in a single node mode - :type num_workers: int - :param storage_bucket: The storage bucket to use, setting to None lets dataproc - generate a custom one for you - :type storage_bucket: str - :param init_actions_uris: List of GCS uri's containing - dataproc initialization scripts - :type init_actions_uris: list[str] - :param init_action_timeout: Amount of time executable scripts in - init_actions_uris has to complete - :type init_action_timeout: str - :param metadata: dict of key-value google compute engine metadata entries - to add to all instances - :type metadata: dict - :param image_version: the version of software inside the Dataproc cluster - :type image_version: str - :param custom_image: custom Dataproc image for more info see - https://cloud.google.com/dataproc/docs/guides/dataproc-images - :type custom_image: str - :param custom_image_project_id: project id for the custom Dataproc image, for more info see - https://cloud.google.com/dataproc/docs/guides/dataproc-images - :type custom_image_project_id: str - :param autoscaling_policy: The autoscaling policy used by the cluster. Only resource names - including projectid and location (region) are valid. Example: - ``projects/[projectId]/locations/[dataproc_region]/autoscalingPolicies/[policy_id]`` - :type autoscaling_policy: str - :param properties: dict of properties to set on - config files (e.g. spark-defaults.conf), see - https://cloud.google.com/dataproc/docs/reference/rest/v1/projects.regions.clusters#SoftwareConfig - :type properties: dict - :param optional_components: List of optional cluster components, for more info see - https://cloud.google.com/dataproc/docs/reference/rest/v1/ClusterConfig#Component - :type optional_components: list[str] - :param num_masters: The # of master nodes to spin up - :type num_masters: int - :param master_machine_type: Compute engine machine type to use for the master node - :type master_machine_type: str - :param master_disk_type: Type of the boot disk for the master node - (default is ``pd-standard``). - Valid values: ``pd-ssd`` (Persistent Disk Solid State Drive) or - ``pd-standard`` (Persistent Disk Hard Disk Drive). - :type master_disk_type: str - :param master_disk_size: Disk size for the master node - :type master_disk_size: int - :param master_num_local_ssds : Number of local SSDs to mount. Local SSDs are used for writing and reading Apache Hadoop and Apache Spark scratch files, such as shuffle outputs. Adding SSDs will improve Spark runtime performance. - (default is 0) - :type master_num_local_ssds : int - :param worker_machine_type: Compute engine machine type to use for the worker nodes - :type worker_machine_type: str - :param worker_disk_type: Type of the boot disk for the worker node - (default is ``pd-standard``). - Valid values: ``pd-ssd`` (Persistent Disk Solid State Drive) or - ``pd-standard`` (Persistent Disk Hard Disk Drive). - :type worker_disk_type: str - :param worker_disk_size: Disk size for the worker nodes - :type worker_disk_size: int - :param worker_num_local_ssds : Number of local SSDs to mount. Local SSDs are used for writing and reading Apache Hadoop and Apache Spark scratch files, such as shuffle outputs. Adding SSDs will improve Spark runtime performance. 
- (default is 0) - :type worker_num_local_ssds : int - :param num_preemptible_workers: The # of preemptible worker nodes to spin up - :type num_preemptible_workers: int - :param labels: dict of labels to add to the cluster - :type labels: dict - :param zone: The zone where the cluster will be located. Set to None to auto-zone. (templated) - :type zone: str - :param network_uri: The network uri to be used for machine communication, cannot be - specified with subnetwork_uri - :type network_uri: str - :param subnetwork_uri: The subnetwork uri to be used for machine communication, - cannot be specified with network_uri - :type subnetwork_uri: str - :param internal_ip_only: If true, all instances in the cluster will only - have internal IP addresses. This can only be enabled for subnetwork - enabled networks - :type internal_ip_only: bool - :param tags: The GCE tags to add to all instances - :type tags: list[str] - :param region: leave as 'global', might become relevant in the future. (templated) - :type region: str - :param gcp_conn_id: The connection ID to use connecting to Google Cloud Platform. - :type gcp_conn_id: str - :param delegate_to: The account to impersonate, if any. - For this to work, the service account making the request must have domain-wide - delegation enabled. - :type delegate_to: str - :param service_account: The service account of the dataproc instances. - :type service_account: str - :param service_account_scopes: The URIs of service account scopes to be included. - :type service_account_scopes: list[str] - :param idle_delete_ttl: The longest duration that cluster would keep alive while - staying idle. Passing this threshold will cause cluster to be auto-deleted. - A duration in seconds. - :type idle_delete_ttl: int - :param auto_delete_time: The time when cluster will be auto-deleted. - :type auto_delete_time: datetime.datetime - :param auto_delete_ttl: The life duration of cluster, the cluster will be - auto-deleted at the end of this duration. - A duration in seconds. (If auto_delete_time is set this parameter will be ignored) - :type auto_delete_ttl: int - :param customer_managed_key: The customer-managed key used for disk encryption - ``projects/[PROJECT_STORING_KEYS]/locations/[LOCATION]/keyRings/[KEY_RING_NAME]/cryptoKeys/[KEY_NAME]`` # noqa # pylint: disable=line-too-long - :type customer_managed_key: str - - Moz specific - :param install_component_gateway: Install alpha feature component gateway. 
- :type install_component_gateway: boolean - - """ - - template_fields = ['cluster_name', 'project_id', 'zone', 'region'] - - # pylint: disable=too-many-arguments,too-many-locals - @apply_defaults - def __init__(self, - project_id, - cluster_name, - num_workers, - job_name=None, # Moz specific - zone=None, - network_uri=None, - subnetwork_uri=None, - internal_ip_only=None, - tags=None, - storage_bucket=None, - init_actions_uris=None, - init_action_timeout="10m", - metadata=None, - custom_image=None, - custom_image_project_id=None, - image_version=None, - autoscaling_policy=None, - properties=None, - optional_components=['ANACONDA'], # Moz specific - num_masters=1, - master_machine_type='n1-standard-4', - master_disk_type='pd-standard', - master_disk_size=500, - master_num_local_ssds=0, - worker_machine_type='n1-standard-4', - worker_disk_type='pd-standard', - worker_disk_size=500, - worker_num_local_ssds=0, - num_preemptible_workers=0, - labels=None, - region='global', - service_account=None, - service_account_scopes=None, - idle_delete_ttl=None, - auto_delete_time=None, - auto_delete_ttl=None, - customer_managed_key=None, - install_component_gateway=True, # Moz specific - *args, - **kwargs): - - super(DataprocClusterCreateOperator, self).__init__( - project_id=project_id, region=region, *args, **kwargs) - self.cluster_name = cluster_name - self.job_name = job_name - self.num_masters = num_masters - self.num_workers = num_workers - self.num_preemptible_workers = num_preemptible_workers - self.storage_bucket = storage_bucket - self.init_actions_uris = init_actions_uris - self.init_action_timeout = init_action_timeout - self.metadata = metadata - self.custom_image = custom_image - self.custom_image_project_id = custom_image_project_id - self.image_version = image_version - self.properties = properties or dict() - self.optional_components = optional_components - self.master_machine_type = master_machine_type - self.master_disk_type = master_disk_type - self.master_disk_size = master_disk_size - self.master_num_local_ssds = master_num_local_ssds - self.autoscaling_policy = autoscaling_policy - self.worker_machine_type = worker_machine_type - self.worker_disk_type = worker_disk_type - self.worker_disk_size = worker_disk_size - self.worker_num_local_ssds = worker_num_local_ssds - self.labels = labels - self.zone = zone - self.network_uri = network_uri - self.subnetwork_uri = subnetwork_uri - self.internal_ip_only = internal_ip_only - self.tags = tags - self.service_account = service_account - self.service_account_scopes = service_account_scopes - self.idle_delete_ttl = idle_delete_ttl - self.auto_delete_time = auto_delete_time - self.auto_delete_ttl = auto_delete_ttl - self.customer_managed_key = customer_managed_key - self.single_node = num_workers == 0 - self.install_component_gateway = install_component_gateway # Moz specific - - assert not (self.custom_image and self.image_version), \ - "custom_image and image_version can't be both set" - - assert ( - not self.single_node or ( - self.single_node and self.num_preemptible_workers == 0 - ) - ), "num_workers == 0 means single node mode - no preemptibles allowed" - - def _get_init_action_timeout(self): - match = re.match(r"^(\d+)(s|m)$", self.init_action_timeout) - if match: - if match.group(2) == "s": - return self.init_action_timeout - elif match.group(2) == "m": - val = float(match.group(1)) - return "{}s".format(timedelta(minutes=val).seconds) - - raise AirflowException( - "DataprocClusterCreateOperator init_action_timeout" - " should be 
expressed in minutes or seconds. i.e. 10m, 30s") - - def _build_gce_cluster_config(self, cluster_data): - """ - We optionally add alpha feature 'enable component gateway' - - """ - - if self.install_component_gateway: # Moz specific start - # Fetch current nested dict and add nested keys - cluster_config_new = cluster_data['config'] - cluster_config_new.update({'endpointConfig' : {'enableHttpPortAccess' : True}}) - - # Overwrite the config key with newly created - cluster_data.update({'config' : cluster_config_new}) # Moz specific end - - - if self.zone: - zone_uri = \ - 'https://www.googleapis.com/compute/v1/projects/{}/zones/{}'.format( - self.project_id, self.zone - ) - cluster_data['config']['gceClusterConfig']['zoneUri'] = zone_uri - - if self.metadata: - cluster_data['config']['gceClusterConfig']['metadata'] = self.metadata - - if self.network_uri: - cluster_data['config']['gceClusterConfig']['networkUri'] = self.network_uri - - if self.subnetwork_uri: - cluster_data['config']['gceClusterConfig']['subnetworkUri'] = \ - self.subnetwork_uri - - if self.internal_ip_only: - if not self.subnetwork_uri: - raise AirflowException("Set internal_ip_only to true only when" - " you pass a subnetwork_uri.") - cluster_data['config']['gceClusterConfig']['internalIpOnly'] = True - - if self.tags: - cluster_data['config']['gceClusterConfig']['tags'] = self.tags - - if self.service_account: - cluster_data['config']['gceClusterConfig']['serviceAccount'] = \ - self.service_account - - if self.service_account_scopes: - cluster_data['config']['gceClusterConfig']['serviceAccountScopes'] = \ - self.service_account_scopes - - return cluster_data - - def _build_lifecycle_config(self, cluster_data): - if self.idle_delete_ttl: - cluster_data['config']['lifecycleConfig']['idleDeleteTtl'] = \ - "{}s".format(self.idle_delete_ttl) - - if self.auto_delete_time: - utc_auto_delete_time = timezone.convert_to_utc(self.auto_delete_time) - cluster_data['config']['lifecycleConfig']['autoDeleteTime'] = \ - utc_auto_delete_time.format('%Y-%m-%dT%H:%M:%S.%fZ', formatter='classic') - elif self.auto_delete_ttl: - cluster_data['config']['lifecycleConfig']['autoDeleteTtl'] = \ - "{}s".format(self.auto_delete_ttl) - - return cluster_data - - def _build_cluster_data(self): - if self.zone: - master_type_uri = \ - "https://www.googleapis.com/compute/v1/projects/{}/zones/{}/machineTypes/{}"\ - .format(self.project_id, self.zone, self.master_machine_type) - worker_type_uri = \ - "https://www.googleapis.com/compute/v1/projects/{}/zones/{}/machineTypes/{}"\ - .format(self.project_id, self.zone, self.worker_machine_type) - else: - master_type_uri = self.master_machine_type - worker_type_uri = self.worker_machine_type - - cluster_data = { - 'projectId': self.project_id, - 'clusterName': self.cluster_name, - 'labels': {}, - 'config': { - 'gceClusterConfig': { - }, - 'masterConfig': { - 'numInstances': self.num_masters, - 'machineTypeUri': master_type_uri, - 'diskConfig': { - 'bootDiskType': self.master_disk_type, - 'bootDiskSizeGb': self.master_disk_size, - 'numLocalSsds': self.master_num_local_ssds, - } - }, - 'workerConfig': { - 'numInstances': self.num_workers, - 'machineTypeUri': worker_type_uri, - 'diskConfig': { - 'bootDiskType': self.worker_disk_type, - 'bootDiskSizeGb': self.worker_disk_size, - 'numLocalSsds': self.worker_num_local_ssds, - } - }, - 'secondaryWorkerConfig': {}, - 'softwareConfig': {}, - 'lifecycleConfig': {}, - 'encryptionConfig': {}, - 'autoscalingConfig': {}, - } - } - if self.num_preemptible_workers > 0: - 
cluster_data['config']['secondaryWorkerConfig'] = { - 'numInstances': self.num_preemptible_workers, - 'machineTypeUri': worker_type_uri, - 'diskConfig': { - 'bootDiskType': self.worker_disk_type, - 'bootDiskSizeGb': self.worker_disk_size - }, - 'isPreemptible': True - } - - cluster_data['labels'] = self.labels or {} - - # Dataproc labels must conform to the following regex: - # [a-z]([-a-z0-9]*[a-z0-9])? (current airflow version string follows - # semantic versioning spec: x.y.z). - cluster_data['labels'].update({'airflow-version': - 'v' + version.replace('.', '-').replace('+', '-')}) - # Moz specific - cluster_data['labels'].update({'owner': self.owner.lower().replace('@mozilla.com', '').replace('.', '-'), - 'env': os.getenv('DEPLOY_ENVIRONMENT', 'env_not_set'), - 'jobname': self.job_name.lower().replace('_', '-')}) - - if self.storage_bucket: - cluster_data['config']['configBucket'] = self.storage_bucket - - if self.image_version: - cluster_data['config']['softwareConfig']['imageVersion'] = self.image_version - - elif self.custom_image: - project_id = self.custom_image_project_id if (self.custom_image_project_id) else self.project_id - custom_image_url = 'https://www.googleapis.com/compute/beta/projects/' \ - '{}/global/images/{}'.format(project_id, - self.custom_image) - cluster_data['config']['masterConfig']['imageUri'] = custom_image_url - if not self.single_node: - cluster_data['config']['workerConfig']['imageUri'] = custom_image_url - - cluster_data = self._build_gce_cluster_config(cluster_data) - - if self.single_node: - self.properties["dataproc:dataproc.allow.zero.workers"] = "true" - - if self.properties: - cluster_data['config']['softwareConfig']['properties'] = self.properties - - if self.optional_components: - cluster_data['config']['softwareConfig']['optionalComponents'] = self.optional_components - - cluster_data = self._build_lifecycle_config(cluster_data) - - if self.init_actions_uris: - init_actions_dict = [ - { - 'executableFile': uri, - 'executionTimeout': self._get_init_action_timeout() - } for uri in self.init_actions_uris - ] - cluster_data['config']['initializationActions'] = init_actions_dict - - if self.customer_managed_key: - cluster_data['config']['encryptionConfig'] =\ - {'gcePdKmsKeyName': self.customer_managed_key} - if self.autoscaling_policy: - cluster_data['config']['autoscalingConfig'] = {'policyUri': self.autoscaling_policy} - - return cluster_data - - def start(self): - """ - Create a new cluster on Google Cloud Dataproc. 
- """ - self.log.info('Creating cluster: %s', self.cluster_name) - cluster_data = self._build_cluster_data() - - return ( - self.hook.get_conn().projects().regions().clusters().create( # pylint: disable=no-member - projectId=self.project_id, - region=self.region, - body=cluster_data, - requestId=str(uuid.uuid4()), - ).execute()) - diff --git a/dags/operators/sleep_operator.py b/dags/operators/sleep_operator.py index f47aed5b0..a59686b35 100644 --- a/dags/operators/sleep_operator.py +++ b/dags/operators/sleep_operator.py @@ -1,10 +1,8 @@ from airflow.models import BaseOperator -from airflow.utils.decorators import apply_defaults import logging import time class SleepOperator(BaseOperator): - @apply_defaults def __init__(self, sleep_time=30, *args, **kwargs): super(SleepOperator, self).__init__(*args, **kwargs) self.sleep_time=sleep_time diff --git a/dags/operators/task_sensor.py b/dags/operators/task_sensor.py index f84a3c1c3..912db98af 100644 --- a/dags/operators/task_sensor.py +++ b/dags/operators/task_sensor.py @@ -5,9 +5,8 @@ from airflow.exceptions import AirflowException from airflow.models import DagBag, DagModel, DagRun, TaskInstance -from airflow.operators.sensors import ExternalTaskSensor +from airflow.sensors.external_task import ExternalTaskSensor from airflow.utils.db import provide_session -from airflow.utils.decorators import apply_defaults from airflow.utils.state import State @@ -28,7 +27,6 @@ class ExternalTaskCompletedSensor(ExternalTaskSensor): """ - @apply_defaults def __init__(self, failed_states = None, *args, **kwargs): super().__init__(*args, **kwargs) self.failed_states = failed_states or [State.FAILED, State.UPSTREAM_FAILED, State.SKIPPED] @@ -41,7 +39,7 @@ def poke(self, context, session=None): dttm = context['execution_date'] - self.execution_delta elif self.execution_date_fn: # Moz specific - _handle_execution_date_fn may not be defined in this context - raise AirflowException("execution_date_fn is not supported by this sensor.") + raise AirflowException("execution_date_fn is not supported by this custom mozilla sensor.") else: dttm = context['execution_date'] @@ -121,4 +119,4 @@ def poke(self, context, session=None): raise AirflowException(f'The external DAG {self.external_dag_id} failed.') session.commit() - return count_allowed == len(dttm_filter) \ No newline at end of file + return count_allowed == len(dttm_filter) diff --git a/dags/parquet_export.py b/dags/parquet_export.py index 169dc5d7a..8ed8325d1 100644 --- a/dags/parquet_export.py +++ b/dags/parquet_export.py @@ -1,14 +1,8 @@ from airflow import DAG from datetime import datetime, timedelta -from airflow.contrib.hooks.aws_hook import AwsHook -from airflow.executors import get_default_executor from airflow.operators.subdag_operator import SubDagOperator from operators.task_sensor import ExternalTaskCompletedSensor -from utils.dataproc import ( - moz_dataproc_pyspark_runner, - moz_dataproc_jar_runner, - get_dataproc_parameters, -) + from utils.gcp import ( bigquery_etl_query, bigquery_etl_copy_deduplicate, @@ -91,7 +85,6 @@ default_args=default_args, num_workers=40), task_id="main_summary_export", - executor=get_default_executor(), dag=dag) clients_daily_export = SubDagOperator( @@ -141,7 +134,6 @@ default_args=default_args, num_preemptible_workers=10), task_id="clients_daily_export", - executor=get_default_executor(), dag=dag) wait_for_clients_daily = ExternalTaskCompletedSensor( diff --git a/dags/prio/dataproc.py b/dags/prio/dataproc.py index 6bf96b67f..5438e1a04 100644 --- a/dags/prio/dataproc.py +++ 
b/dags/prio/dataproc.py @@ -1,17 +1,16 @@ from airflow import DAG -from airflow.contrib.hooks.gcp_api_base_hook import GoogleCloudBaseHook -from airflow.contrib.operators.dataproc_operator import ( - DataprocClusterCreateOperator, - DataprocClusterDeleteOperator, - DataProcPySparkOperator, +from airflow.providers.google.cloud.operators.dataproc import ( + DataprocCreateClusterOperator, + DataprocDeleteClusterOperator, + DataprocSubmitPySparkJobOperator, ) - def spark_subdag( parent_dag_name, child_dag_name, default_args, gcp_conn_id, + project_id, service_account, main, pyfiles, @@ -27,6 +26,7 @@ def spark_subdag( :param str child_dag_name: Name of the child DAG. :param Dict[str, Any] default_args: Default arguments for the child DAG. :param str gcp_conn_id: Name of the connection string. + :param str project_id: GCP project id corresponding to the gcp_conn_id. :param str service_account: The address of the service account. :param str dataproc_region: The region of the DataProc cluster. :param str main: @@ -36,12 +36,10 @@ def spark_subdag( :return: DAG """ - connection = GoogleCloudBaseHook(gcp_conn_id=gcp_conn_id) - shared_config = { "cluster_name": "prio-staging-{{ds_nodash}}", "gcp_conn_id": gcp_conn_id, - "project_id": connection.project_id, + "project_id": project_id, # From an error when not specifying the region: # - Dataproc images 2.0 and higher do not support the to-be # deprecated global region. Please use any non-global Dataproc @@ -54,7 +52,7 @@ def spark_subdag( } with DAG(f"{parent_dag_name}.{child_dag_name}", default_args=default_args) as dag: - create_dataproc_cluster = DataprocClusterCreateOperator( + create_dataproc_cluster = DataprocCreateClusterOperator( task_id="create_dataproc_cluster", image_version="preview-ubuntu18", service_account=service_account, @@ -68,10 +66,10 @@ def spark_subdag( **shared_config, ) - run_dataproc_spark = DataProcPySparkOperator( + run_dataproc_spark = DataprocSubmitPySparkJobOperator( task_id="run_dataproc_spark", main=main, - dataproc_pyspark_jars=[ + dataproc_jars=[ "gs://spark-lib/bigquery/spark-bigquery-latest_2.12.jar" ], pyfiles=pyfiles, @@ -80,7 +78,7 @@ def spark_subdag( **shared_config, ) - delete_dataproc_cluster = DataprocClusterDeleteOperator( + delete_dataproc_cluster = DataprocDeleteClusterOperator( task_id="delete_dataproc_cluster", trigger_rule="all_done", dag=dag, diff --git a/dags/prio/kubernetes.py b/dags/prio/kubernetes.py index bb3831485..ddf1cec7b 100644 --- a/dags/prio/kubernetes.py +++ b/dags/prio/kubernetes.py @@ -2,10 +2,9 @@ from os import environ from airflow import DAG -from airflow.contrib.hooks.gcp_api_base_hook import GoogleCloudBaseHook -from airflow.contrib.operators.gcp_container_operator import ( - GKEClusterCreateOperator, - GKEClusterDeleteOperator, +from airflow.providers.google.cloud.operators.kubernetes_engine import ( + GKECreateClusterOperator, + GKEDeleteClusterOperator, ) from airflow.operators.bash_operator import BashOperator from operators.gcp_container_operator import GKEPodOperator @@ -17,6 +16,7 @@ def container_subdag( child_dag_name, default_args, gcp_conn_id, + project_id, service_account, server_id, env_vars={}, @@ -35,6 +35,7 @@ def container_subdag( :param str child_dag_name: Name of the child DAG. :param Dict[str, Any] default_args: Default arguments for the child DAG. :param str gcp_conn_id: Name of the connection string. + :param str project_id: GCP project id associated with the gcp_conn_id. :param str service_account: The address of the service account. 
:param str server_id: The identifier for the Prio processor :param Dict[str, str] env_vars: Environment variables for configuring @@ -50,12 +51,10 @@ def container_subdag( """ assert server_id in ["a", "b", "admin"] - connection = GoogleCloudBaseHook(gcp_conn_id=gcp_conn_id) - cluster_name = f"gke-prio-{server_id}" shared_config = { - "project_id": connection.project_id, + "project_id": project_id, "gcp_conn_id": gcp_conn_id, "location": location, } @@ -67,7 +66,7 @@ def container_subdag( # https://kubernetes.io/docs/concepts/scheduling-eviction/assign-pod-node/ # https://cloud.google.com/composer/docs/how-to/using/using-kubernetes-pod-operator # https://airflow.apache.org/docs/stable/_api/airflow/contrib/operators/kubernetes_pod_operator/index.html - create_gke_cluster = GKEClusterCreateOperator( + create_gke_cluster = GKECreateClusterOperator( task_id="create_gke_cluster", body=create_gke_config( name=cluster_name, @@ -141,7 +140,7 @@ def container_subdag( **kwargs, ) - delete_gke_cluster = GKEClusterDeleteOperator( + delete_gke_cluster = GKEDeleteClusterOperator( task_id="delete_gke_cluster", name=cluster_name, trigger_rule="all_done", diff --git a/dags/prio/processor.py b/dags/prio/processor.py index 30bfcf618..0f6687068 100644 --- a/dags/prio/processor.py +++ b/dags/prio/processor.py @@ -37,10 +37,9 @@ from functools import partial from airflow import DAG -from airflow.contrib.operators.gcs_to_gcs import ( - GoogleCloudStorageToGoogleCloudStorageOperator, -) -from airflow.operators import DummyOperator, PythonOperator +from airflow.providers.google.cloud.transfers.gcs_to_gcs import GCSToGCSOperator +from airflow.operators.python import PythonOperator +from airflow.operators.dummy import DummyOperator from airflow.operators.subdag_operator import SubDagOperator from prio import dataproc, kubernetes @@ -56,7 +55,7 @@ def transfer_data_subdag( submission_date, server_id, public_key_hex_external, - google_cloud_storage_conn_id, + gcp_conn_id, ): """Copy the partitioned data from the staging bucket into the corresponding receiving buckets in each processor. 
The job then submits a `_SUCCESS` file @@ -78,22 +77,22 @@ def transfer_data_subdag( "raw/shares", ] ) - transfer_dataset = GoogleCloudStorageToGoogleCloudStorageOperator( + transfer_dataset = GCSToGCSOperator( task_id="transfer_dataset", source_bucket=source_bucket, source_object=f"staging/submission_date={submission_date}/server_id={server_id}/*", destination_bucket=destination_bucket, destination_object=f"{prefix}/", - google_cloud_storage_conn_id=google_cloud_storage_conn_id, + gcp_conn_id=gcp_conn_id, dag=dag, ) - mark_dataset_success = GoogleCloudStorageToGoogleCloudStorageOperator( + mark_dataset_success = GCSToGCSOperator( task_id="mark_dataset_success", source_bucket=source_bucket, source_object="staging/_SUCCESS", destination_bucket=destination_bucket, destination_object=f"{prefix}/_SUCCESS", - google_cloud_storage_conn_id=google_cloud_storage_conn_id, + gcp_conn_id=gcp_conn_id, dag=dag, ) transfer_dataset >> mark_dataset_success @@ -104,6 +103,7 @@ def ingestion_subdag( dag, default_args, gcp_conn_id, + project_id, service_account, bucket_bootstrap_admin, bucket_data_admin, @@ -125,13 +125,14 @@ def ingestion_subdag( default_args=default_args, server_id="admin", gcp_conn_id=gcp_conn_id, + project_id=project_id, service_account=service_account, arguments=[ "bash", "-xc", f"source bin/dataproc; bootstrap gs://{bucket_bootstrap_admin}", ], - env_var=dict(SUBMODULE="origin"), + env_vars=dict(SUBMODULE="origin"), ), task_id="bootstrap", dag=dag, @@ -144,6 +145,7 @@ def ingestion_subdag( child_dag_name="staging", default_args=default_args, gcp_conn_id=gcp_conn_id, + project_id=project_id, service_account=service_account, main=f"gs://{bucket_bootstrap_admin}/processor-origin.py", pyfiles=[f"gs://{bucket_bootstrap_admin}/prio_processor.egg"], @@ -175,7 +177,7 @@ def ingestion_subdag( destination_bucket_prefix=bucket_prefix, app_name=app_name, submission_date="{{ ds }}", - google_cloud_storage_conn_id=gcp_conn_id, + gcp_conn_id=gcp_conn_id, ) transfer_a = SubDagOperator( @@ -213,7 +215,7 @@ def ingestion_subdag( def prio_processor_subdag( - dag, default_args, gcp_conn_id, service_account, server_id, env_vars + dag, default_args, gcp_conn_id, project_id, service_account, server_id, env_vars ): return SubDagOperator( subdag=kubernetes.container_subdag( @@ -221,6 +223,7 @@ def prio_processor_subdag( child_dag_name=f"processor_{server_id}", default_args=default_args, gcp_conn_id=gcp_conn_id, + project_id=project_id, service_account=service_account, server_id=server_id, arguments=["bin/process"], @@ -231,7 +234,7 @@ def prio_processor_subdag( ) -def load_bigquery_subdag(dag, default_args, gcp_conn_id, service_account, env_vars): +def load_bigquery_subdag(dag, default_args, gcp_conn_id, project_id, service_account, env_vars): # Take the resulting aggregates and insert them into a BigQuery table. This # table is effectively append-only, so rerunning the dag will cause duplicate # results. 
In practice, rerunning the DAG is problematic when operation is @@ -243,6 +246,7 @@ def load_bigquery_subdag(dag, default_args, gcp_conn_id, service_account, env_va default_args=default_args, server_id="admin", gcp_conn_id=gcp_conn_id, + project_id=project_id, service_account=service_account, arguments=["bash", "-c", "bin/insert"], env_vars=env_vars, diff --git a/dags/prio_processor.py b/dags/prio_processor.py index 4d296cd49..41953c9da 100644 --- a/dags/prio_processor.py +++ b/dags/prio_processor.py @@ -2,7 +2,6 @@ from os import environ from airflow import DAG -from airflow.contrib.hooks.gcs_hook import GoogleCloudStorageHook from prio.processor import ingestion_subdag, load_bigquery_subdag, prio_processor_subdag DEFAULT_ARGS = { @@ -26,13 +25,12 @@ ENVIRONMENT = "dev" if IS_DEV else "prod" PRIO_ADMIN_CONN = "google_cloud_prio_admin" +PRIO_ADMIN_PROJECT_ID = "moz-fx-prio-admin-prod-098j" PRIO_A_CONN = "google_cloud_prio_a" +PRIO_A_PROJECT_ID = "moz-fx-prio-a-prod-kju7" -PROJECT_ADMIN = GoogleCloudStorageHook(PRIO_ADMIN_CONN).project_id -PROJECT_A = GoogleCloudStorageHook(PRIO_A_CONN).project_id - -SERVICE_ACCOUNT_ADMIN = f"prio-admin-runner@{PROJECT_ADMIN}.iam.gserviceaccount.com" -SERVICE_ACCOUNT_A = f"prio-runner-{ENVIRONMENT}-a@{PROJECT_A}.iam.gserviceaccount.com" +SERVICE_ACCOUNT_ADMIN = f"prio-admin-runner@{PRIO_ADMIN_PROJECT_ID}.iam.gserviceaccount.com" +SERVICE_ACCOUNT_A = f"prio-runner-{ENVIRONMENT}-a@{PRIO_A_PROJECT_ID}.iam.gserviceaccount.com" # Private bucket of server B is necessary for transfer BUCKET_PRIVATE_A = f"moz-fx-prio-{ENVIRONMENT}-a-private" @@ -58,6 +56,7 @@ dag, DEFAULT_ARGS, PRIO_ADMIN_CONN, + PRIO_ADMIN_PROJECT_ID, SERVICE_ACCOUNT_ADMIN, BUCKET_BOOTSTRAP_ADMIN, BUCKET_DATA_ADMIN, @@ -73,6 +72,7 @@ dag, DEFAULT_ARGS, PRIO_A_CONN, + PRIO_A_PROJECT_ID, SERVICE_ACCOUNT_A, "a", { @@ -99,6 +99,7 @@ dag, DEFAULT_ARGS, PRIO_ADMIN_CONN, + PRIO_ADMIN_PROJECT_ID, SERVICE_ACCOUNT_ADMIN, env_vars={ "APP_NAME": APP_NAME, diff --git a/dags/prio_processor_external.py b/dags/prio_processor_external.py index 472accfe4..ae1e31a2a 100644 --- a/dags/prio_processor_external.py +++ b/dags/prio_processor_external.py @@ -2,7 +2,6 @@ from os import environ from airflow import DAG -from airflow.contrib.hooks.gcs_hook import GoogleCloudStorageHook from prio.processor import prio_processor_subdag DEFAULT_ARGS = { @@ -25,7 +24,7 @@ IS_DEV = environ.get("DEPLOY_ENVIRONMENT") != "prod" ENVIRONMENT = "dev" if IS_DEV else "prod" PRIO_B_CONN = "google_cloud_prio_b" -PROJECT_B = GoogleCloudStorageHook(PRIO_B_CONN).project_id +PROJECT_B = "moz-fx-prio-b-prod-a67n" SERVICE_ACCOUNT_B = f"prio-runner-{ENVIRONMENT}-b@{PROJECT_B}.iam.gserviceaccount.com" BUCKET_PRIVATE_B = f"moz-fx-prio-{ENVIRONMENT}-b-private" BUCKET_SHARED_A = f"moz-fx-prio-{ENVIRONMENT}-a-shared" @@ -48,6 +47,7 @@ dag, DEFAULT_ARGS, PRIO_B_CONN, + PROJECT_B, SERVICE_ACCOUNT_B, "b", { diff --git a/dags/probe_scraper.py b/dags/probe_scraper.py index e7fb7c279..9f9cf822f 100644 --- a/dags/probe_scraper.py +++ b/dags/probe_scraper.py @@ -2,13 +2,13 @@ from datetime import datetime, timedelta from airflow import DAG -from airflow.contrib.hooks.aws_hook import AwsHook -from airflow.contrib.hooks.gcp_api_base_hook import GoogleCloudBaseHook +from airflow.providers.amazon.aws.hooks.base_aws import AwsBaseHook from airflow.models import Variable from airflow.operators.http_operator import SimpleHttpOperator from airflow.operators.python_operator import PythonOperator from operators.gcp_container_operator import GKEPodOperator + DOCS = """\ # 
Probe Scraper @@ -36,6 +36,9 @@ Adjust the time window as needed and you should be able to see logs associated with the failure. """ +DEFAULT_LOOKML_GENERATOR_IMAGE_VERSION = "v1.17.0" + + default_args = { 'owner': 'dthorn@mozilla.com', 'depends_on_past': False, @@ -52,7 +55,7 @@ schedule_interval='0 0 * * 1-5') as dag: aws_conn_id='aws_prod_probe_scraper' - aws_access_key, aws_secret_key, session = AwsHook(aws_conn_id).get_credentials() + aws_access_key, aws_secret_key, session = AwsBaseHook(aws_conn_id=aws_conn_id, client_type='s3').get_credentials() # Built from repo https://github.com/mozilla/probe-scraper probe_scraper_image='gcr.io/moz-fx-data-airflow-prod-88e0/probe-scraper:latest' @@ -132,15 +135,20 @@ probe_scraper >> delay_python_task gcp_gke_conn_id = "google_cloud_airflow_gke" + project_id = "moz-fx-data-airflow-gke-prod" + image_tag = Variable.get("lookml_generator_release_str") + if image_tag is None: + image_tag = DEFAULT_LOOKML_GENERATOR_IMAGE_VERSION + lookml_generator_prod = GKEPodOperator( owner="ascholtz@mozilla.com", email=["ascholtz@mozilla.com", "dataops+alerts@mozilla.com"], task_id="lookml_generator", name="lookml-generator-1", - image="gcr.io/moz-fx-data-airflow-prod-88e0/lookml-generator:" + Variable.get("lookml_generator_release_str"), + image="gcr.io/moz-fx-data-airflow-prod-88e0/lookml-generator:" + image_tag, startup_timeout_seconds=500, gcp_conn_id=gcp_gke_conn_id, - project_id=GoogleCloudBaseHook(gcp_conn_id=gcp_gke_conn_id).project_id, + project_id=project_id, cluster_name="workloads-prod-v1", location="us-west1", dag=dag, @@ -168,7 +176,7 @@ name="lookml-generator-staging-1", image="gcr.io/moz-fx-data-airflow-prod-88e0/lookml-generator:latest", gcp_conn_id=gcp_gke_conn_id, - project_id=GoogleCloudBaseHook(gcp_conn_id=gcp_gke_conn_id).project_id, + project_id=project_id, cluster_name="workloads-prod-v1", location="us-west1", dag=dag, diff --git a/dags/socorro_import.py b/dags/socorro_import.py index a1057b355..3a2e131dd 100644 --- a/dags/socorro_import.py +++ b/dags/socorro_import.py @@ -1,12 +1,11 @@ from airflow import DAG from airflow.operators.subdag_operator import SubDagOperator -from airflow.contrib.hooks.aws_hook import AwsHook -from airflow.contrib.hooks.gcp_api_base_hook import GoogleCloudBaseHook -from airflow.contrib.operators.bigquery_table_delete_operator import ( - BigQueryTableDeleteOperator, +from airflow.providers.google.cloud.operators.bigquery import ( + BigQueryDeleteTableOperator, ) -from airflow.contrib.operators.gcp_transfer_operator import ( - S3ToGoogleCloudStorageTransferOperator, + +from airflow.providers.google.cloud.operators.cloud_storage_transfer_service import ( + CloudDataTransferServiceS3ToGCSOperator ) from datetime import datetime, timedelta @@ -53,7 +52,7 @@ # Defined in Airflow's UI -> Admin -> Connections gcp_conn_id = "google_cloud_airflow_dataproc" -connection = GoogleCloudBaseHook(gcp_conn_id=gcp_conn_id) +project_id = "airflow-dataproc" # Required to copy socorro json data from aws prod s3 to gcs read_aws_conn_id = "aws_socorro_readonly_s3" @@ -73,14 +72,14 @@ ) # copy json crashstats from s3 to gcs -s3_to_gcs = S3ToGoogleCloudStorageTransferOperator( +s3_to_gcs = CloudDataTransferServiceS3ToGCSOperator( task_id="s3_to_gcs", s3_bucket="crashstats-telemetry-crashes-prod-us-west-2", + project_id=project_id, gcs_bucket=gcs_data_bucket, description="socorro crash report copy from s3 to gcs", aws_conn_id=read_aws_conn_id, gcp_conn_id=gcp_conn_id, - project_id=connection.project_id, 
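
A note on the lookml_generator image-tag fallback added to probe_scraper.py above: in the Airflow releases targeted here, Variable.get() raises KeyError for a missing Variable unless a default_var is supplied, so the `if image_tag is None` branch only covers a Variable that is explicitly set to a null value. A minimal sketch of the same fallback expressed through default_var (illustrative only, not part of this diff):

    from airflow.models import Variable

    # Fall back to the pinned default when the Variable is absent or unset.
    image_tag = Variable.get(
        "lookml_generator_release_str",
        default_var=DEFAULT_LOOKML_GENERATOR_IMAGE_VERSION,
    )
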
object_conditions={"includePrefixes": "v1/crash_report/{{ ds_nodash }}"}, transfer_options={"deleteObjectsUniqueInSink": True}, timeout=3600, @@ -116,7 +115,7 @@ bq_gcp_conn_id = "google_cloud_derived_datasets" -bq_connection = GoogleCloudBaseHook(gcp_conn_id=bq_gcp_conn_id) +bq_project_id = "moz-fx-data-derived-datasets" dest_s3_key = "s3://telemetry-parquet" @@ -142,9 +141,9 @@ ] # We remove the current date partition for idempotency. -remove_bq_table_partition = BigQueryTableDeleteOperator( +remove_bq_table_partition = BigQueryDeleteTableOperator( task_id="remove_bq_table_partition", - bigquery_conn_id=bq_gcp_conn_id, + gcp_conn_id=bq_gcp_conn_id, deletion_dataset_table="{}.{}${{{{ds_nodash}}}}".format(bq_dataset, bq_table_name), ignore_if_missing=True, dag=dag, @@ -153,7 +152,7 @@ bq_load = GKEPodOperator( task_id="bigquery_load", gcp_conn_id=bq_gcp_conn_id, - project_id=bq_connection.project_id, + project_id=bq_project_id, name="load-socorro-crash-parquet-to-bq", image=docker_image, arguments=gke_args, diff --git a/dags/taar_daily.py b/dags/taar_daily.py index be94588c3..2dc8ecdc1 100644 --- a/dags/taar_daily.py +++ b/dags/taar_daily.py @@ -1,8 +1,8 @@ from datetime import datetime, timedelta from airflow import DAG -from airflow.contrib.hooks.aws_hook import AwsHook -from airflow.operators.sensors import ExternalTaskSensor +from airflow.providers.amazon.aws.hooks.base_aws import AwsBaseHook +from airflow.sensors.external_task import ExternalTaskSensor from airflow.operators.subdag_operator import SubDagOperator from airflow.models import Variable from itertools import chain @@ -21,14 +21,16 @@ # Dataproc connection to GCP -gcpdataproc_conn_id = "google_cloud_airflow_dataproc" +taar_gcpdataproc_conn_id = "google_cloud_airflow_dataproc" +taar_gcpdataproc_project_id = "airflow-dataproc" taar_aws_conn_id = "airflow_taar_rw_s3" -taar_aws_access_key, taar_aws_secret_key, session = AwsHook(taar_aws_conn_id).get_credentials() +taar_aws_access_key, taar_aws_secret_key, session = AwsBaseHook( + aws_conn_id=taar_aws_conn_id, client_type='s3').get_credentials() taarlite_cluster_name = "dataproc-taarlite-guidguid" taar_locale_cluster_name = "dataproc-taar-locale" taar_similarity_cluster_name = "dataproc-taar-similarity" -taar_gcpdataproc_conn_id = "google_cloud_airflow_dataproc" + default_args = { "owner": "epavlov@mozilla.com", @@ -107,7 +109,8 @@ "--prefix", "taar/locale", ], - gcp_conn_id=taar_gcpdataproc_conn_id + gcp_conn_id=taar_gcpdataproc_conn_id, + project_id=taar_gcpdataproc_project_id ), dag=dag ) @@ -136,6 +139,7 @@ "--prefix", "taar/similarity" ], gcp_conn_id=taar_gcpdataproc_conn_id, + project_id=taar_gcpdataproc_project_id, master_disk_type="pd-ssd", worker_disk_type="pd-ssd", master_disk_size=1024, @@ -176,6 +180,7 @@ init_actions_uris=[], aws_conn_id=taar_aws_conn_id, gcp_conn_id=taar_gcpdataproc_conn_id, + project_id=taar_gcpdataproc_project_id, default_args=default_args ), dag=dag, @@ -204,11 +209,13 @@ "--prefix", "taar/lite" ], gcp_conn_id=taar_gcpdataproc_conn_id, + project_id=taar_gcpdataproc_project_id, ), dag=dag, ) + taar_lite_guidranking = GKEPodOperator( task_id="taar_lite_guidranking", name="taar_lite_guidranking", @@ -230,3 +237,4 @@ wait_for_clients_daily_export >> taar_collaborative_recommender wait_for_clients_daily_export >> taar_lite wait_for_clients_daily_export >> taar_lite_guidranking + diff --git a/dags/taar_weekly.py b/dags/taar_weekly.py index 4f3c7b958..628a9f5c0 100644 --- a/dags/taar_weekly.py +++ b/dags/taar_weekly.py @@ -12,6 +12,7 @@ 
taar_ensemble_cluster_name = "dataproc-taar-ensemble" taar_gcpdataproc_conn_id = "google_cloud_airflow_dataproc" +taar_gcpdataproc_project_id = "airflow-dataproc" TAAR_BIGTABLE_INSTANCE_ID = Variable.get("taar_bigtable_instance_id") TAAR_ETL_STORAGE_BUCKET = Variable.get("taar_etl_storage_bucket") @@ -181,6 +182,7 @@ def taar_profile_common_args(): "0.005", ], gcp_conn_id=taar_gcpdataproc_conn_id, + project_id=taar_gcpdataproc_project_id, master_disk_type="pd-ssd", worker_disk_type="pd-ssd", master_disk_size=1024, diff --git a/dags/update_orphaning_dashboard_etl.py b/dags/update_orphaning_dashboard_etl.py index 4b97f1d4a..222ba377b 100644 --- a/dags/update_orphaning_dashboard_etl.py +++ b/dags/update_orphaning_dashboard_etl.py @@ -1,6 +1,5 @@ from airflow import DAG -from airflow.contrib.hooks.aws_hook import AwsHook -from airflow.contrib.hooks.gcp_api_base_hook import GoogleCloudBaseHook +from airflow.providers.amazon.aws.hooks.base_aws import AwsBaseHook from airflow.operators.subdag_operator import SubDagOperator from datetime import datetime, timedelta @@ -33,11 +32,11 @@ # Defined in Airflow's UI -> Admin -> Connections gcp_conn_id = 'google_cloud_airflow_dataproc' -connection = GoogleCloudBaseHook(gcp_conn_id=gcp_conn_id) # Required to write json output back to s3://telemetry-public-analysis-2/app-update/data/out-of-date/ write_aws_conn_id='aws_dev_telemetry_public_analysis_2_rw' -aws_access_key, aws_secret_key, session = AwsHook(write_aws_conn_id).get_credentials() +aws_access_key, aws_secret_key, session = AwsBaseHook( + aws_conn_id=write_aws_conn_id, client_type='s3').get_credentials() crash_report_parquet = SubDagOperator( task_id="update_orphaning_dashboard_etl", diff --git a/dags/utils/dataproc.py b/dags/utils/dataproc.py index 0ab795184..d35b47751 100644 --- a/dags/utils/dataproc.py +++ b/dags/utils/dataproc.py @@ -3,21 +3,21 @@ from collections import namedtuple from airflow import models -from airflow.contrib.hooks.aws_hook import AwsHook -from airflow.operators.bash_operator import BashOperator -from airflow.contrib.hooks.gcp_api_base_hook import GoogleCloudBaseHook -from airflow.contrib.operators.dataproc_operator import DataprocClusterDeleteOperator, DataProcSparkOperator, DataProcPySparkOperator from airflow.exceptions import AirflowException -from airflow.utils.trigger_rule import TriggerRule - -# Our own dataproc operator used to install component gateway -from operators.moz_dataproc_operator import DataprocClusterCreateOperator - -""" -Note: We are currently on 1.10.7 and when we upgrade, the spark operators will move. -This module is deprecated. 
Please use `airflow.providers.google.cloud.operators.dataproc -""" - +from airflow.operators.bash_operator import BashOperator +from airflow.providers.amazon.aws.hooks.base_aws import AwsBaseHook + +# When google deprecates dataproc_v1beta2 in DataprocHook/Operator classes +# We can import these from our patched code, rather than upgrading/deploying +# apache-airflow-providers-google > 6.0.0, and google-cloud-dataproc > 2.5.0 +# from utils.patched.dataproc_operator import ( +from airflow.providers.google.cloud.operators.dataproc import ( + ClusterGenerator, + DataprocCreateClusterOperator, + DataprocDeleteClusterOperator, + DataprocSubmitPySparkJobOperator, + DataprocSubmitSparkJobOperator, +) class DataProcHelper: """ @@ -32,8 +32,8 @@ def __init__(self, region='us-west1', subnetwork_uri=None, internal_ip_only=None, - idle_delete_ttl='14400', - auto_delete_ttl='28800', + idle_delete_ttl=14400, + auto_delete_ttl=28800, master_machine_type='n1-standard-8', worker_machine_type='n1-standard-4', num_preemptible_workers=0, @@ -45,6 +45,7 @@ def __init__(self, install_component_gateway=True, aws_conn_id=None, gcp_conn_id='google_cloud_airflow_dataproc', + project_id='airflow-dataproc', artifact_bucket='moz-fx-data-prod-airflow-dataproc-artifacts', storage_bucket='moz-fx-data-prod-dataproc-scratch', master_disk_type='pd-standard', @@ -99,12 +100,11 @@ def __init__(self, self.install_component_gateway = install_component_gateway self.aws_conn_id = aws_conn_id self.gcp_conn_id = gcp_conn_id - - self.connection = GoogleCloudBaseHook(gcp_conn_id=self.gcp_conn_id) + self.project_id = project_id def create_cluster(self): """ - Returns a DataprocClusterCreateOperator + Returns a DataprocCreateClusterOperator """ properties = {} @@ -115,7 +115,7 @@ def create_cluster(self): if self.aws_conn_id: for key, value in zip( ("access.key", "secret.key", "session.token"), - AwsHook(self.aws_conn_id).get_credentials(), + AwsBaseHook(aws_conn_id=self.aws_conn_id, client_type='s3').get_credentials(), ): if value is not None: properties["core:fs.s3a." 
+ key] = value @@ -133,48 +133,71 @@ def create_cluster(self): } metadata.update(self.additional_metadata) - return DataprocClusterCreateOperator( - task_id='create_dataproc_cluster', - cluster_name=self.cluster_name, - job_name=self.job_name, - gcp_conn_id=self.gcp_conn_id, - service_account=self.service_account, - project_id=self.connection.project_id, + cluster_generator = ClusterGenerator( + project_id = self.project_id, + num_workers = self.num_workers, + subnetwork_uri = self.subnetwork_uri, + internal_ip_only = self.internal_ip_only, storage_bucket=self.storage_bucket, - num_workers=self.num_workers, + init_actions_uris=self.init_actions_uris, + metadata = metadata, image_version=self.image_version, - properties=properties, - region=self.region, - subnetwork_uri=self.subnetwork_uri, - internal_ip_only=self.internal_ip_only, - idle_delete_ttl=self.idle_delete_ttl, - auto_delete_ttl=self.auto_delete_ttl, - master_machine_type=self.master_machine_type, - worker_machine_type=self.worker_machine_type, - num_preemptible_workers=self.num_preemptible_workers, + properties = properties, optional_components = self.optional_components, - install_component_gateway = self.install_component_gateway, - init_actions_uris=self.init_actions_uris, + master_machine_type=self.master_machine_type, master_disk_type=self.master_disk_type, master_disk_size=self.master_disk_size, + worker_machine_type=self.worker_machine_type, worker_disk_type=self.worker_disk_type, worker_disk_size=self.worker_disk_size, - master_num_local_ssds=self.master_num_local_ssds, - worker_num_local_ssds=self.worker_num_local_ssds, - metadata=metadata, + num_preemptible_workers=self.num_preemptible_workers, + service_account=self.service_account, + idle_delete_ttl=self.idle_delete_ttl, + auto_delete_ttl=self.auto_delete_ttl + ) + + cluster_config = cluster_generator.make() + + # The DataprocCreateClusterOperator and ClusterGenerator dont support component gateway or local ssds + # ClusterConfig format is + # https://cloud.google.com/dataproc/docs/reference/rpc/google.cloud.dataproc.v1#google.cloud.dataproc.v1.ClusterConfig + if self.install_component_gateway: + cluster_config.update({'endpoint_config' : {'enable_http_port_access' : True}}) + + if self.master_num_local_ssds > 0: + master_instance_group_config = cluster_config['master_config'] + master_instance_group_config['disk_config']['num_local_ssds'] = self.master_num_local_ssds + cluster_config.update({'master_config' : master_instance_group_config}) + + if self.worker_num_local_ssds > 0: + worker_instance_group_config = cluster_config['worker_config'] + worker_instance_group_config['disk_config']['num_local_ssds'] =self.worker_num_local_ssds + cluster_config.update({'worker_config' : worker_instance_group_config}) + + return DataprocCreateClusterOperator( + task_id='create_dataproc_cluster', + cluster_name=self.cluster_name, + project_id = self.project_id, + use_if_exists=True, + delete_on_error=True, + labels={ 'env': os.getenv('DEPLOY_ENVIRONMENT', 'env_not_set'), + 'owner': os.getenv('AIRFLOW_CTX_DAG_OWNER', 'owner_not_set'), + 'jobname': self.job_name.lower().replace('_', '-') }, + gcp_conn_id=self.gcp_conn_id, + region=self.region, + cluster_config = cluster_config ) def delete_cluster(self): """ - Returns a DataprocClusterDeleteOperator + Returns a DataprocDeleteClusterOperator """ - return DataprocClusterDeleteOperator( + return DataprocDeleteClusterOperator( task_id='delete_dataproc_cluster', - trigger_rule=TriggerRule.ALL_DONE, cluster_name=self.cluster_name, 
region=self.region, gcp_conn_id=self.gcp_conn_id, - project_id=self.connection.project_id) + project_id=self.project_id) # End DataProcHelper @@ -187,8 +210,8 @@ def moz_dataproc_pyspark_runner(parent_dag_name=None, region='us-west1', subnetwork_uri=None, internal_ip_only=None, - idle_delete_ttl='10800', - auto_delete_ttl='21600', + idle_delete_ttl=10800, + auto_delete_ttl=21600, master_machine_type='n1-standard-8', worker_machine_type='n1-standard-4', num_preemptible_workers=0, @@ -203,6 +226,7 @@ def moz_dataproc_pyspark_runner(parent_dag_name=None, job_name=None, aws_conn_id=None, gcp_conn_id='google_cloud_airflow_dataproc', + project_id='airflow-dataproc', artifact_bucket='moz-fx-data-prod-airflow-dataproc-artifacts', storage_bucket='moz-fx-data-prod-dataproc-scratch', master_disk_type='pd-standard', @@ -215,7 +239,7 @@ def moz_dataproc_pyspark_runner(parent_dag_name=None, """ This will initially create a GCP Dataproc cluster with Anaconda/Jupyter/Component gateway. - Then we call DataProcPySparkOperator to execute the pyspark script defined by the argument + Then we call DataprocSubmitPySparkJobOperator to execute the pyspark script defined by the argument python_driver_code. Once that succeeds, we teardown the cluster. **Example**: :: @@ -281,6 +305,9 @@ def moz_dataproc_pyspark_runner(parent_dag_name=None, :param str aws_conn_id: Airflow connection id for S3 access (if needed). :param str gcp_conn_id: The connection ID to use connecting to GCP. + :param str project_id: The project ID corresponding to the gcp_conn_id. We + add this because the dev environment doesn't parse it out + correctly from the dummy connections. :param str artifact_bucket: Path to resources for bootstrapping the dataproc cluster :param str storage_bucket: Path to scratch bucket for intermediate cluster results :param list optional_components: List of optional components to install on cluster @@ -338,6 +365,7 @@ def moz_dataproc_pyspark_runner(parent_dag_name=None, install_component_gateway=install_component_gateway, aws_conn_id=aws_conn_id, gcp_conn_id=gcp_conn_id, + project_id=project_id, artifact_bucket=artifact_bucket, storage_bucket=storage_bucket, master_disk_type=master_disk_type, @@ -353,7 +381,7 @@ def moz_dataproc_pyspark_runner(parent_dag_name=None, with models.DAG(_dag_name, default_args=default_args) as dag: create_dataproc_cluster = dataproc_helper.create_cluster() - run_pyspark_on_dataproc = DataProcPySparkOperator( + run_pyspark_on_dataproc = DataprocSubmitPySparkJobOperator( task_id='run_dataproc_pyspark', job_name=job_name, cluster_name=cluster_name, @@ -361,6 +389,7 @@ def moz_dataproc_pyspark_runner(parent_dag_name=None, main=python_driver_code, arguments=py_args, gcp_conn_id=gcp_conn_id, + project_id=project_id ) delete_dataproc_cluster = dataproc_helper.delete_cluster() @@ -379,8 +408,8 @@ def moz_dataproc_jar_runner(parent_dag_name=None, region='us-west1', subnetwork_uri=None, internal_ip_only=None, - idle_delete_ttl='14400', - auto_delete_ttl='28800', + idle_delete_ttl=14400, + auto_delete_ttl=28800, master_machine_type='n1-standard-8', worker_machine_type='n1-standard-4', num_preemptible_workers=0, @@ -394,6 +423,7 @@ def moz_dataproc_jar_runner(parent_dag_name=None, job_name=None, aws_conn_id=None, gcp_conn_id='google_cloud_airflow_dataproc', + project_id='airflow-dataproc', master_disk_type='pd-standard', worker_disk_type='pd-standard', master_disk_size=1024, @@ -404,7 +434,7 @@ def moz_dataproc_jar_runner(parent_dag_name=None, """ This will initially create a GCP Dataproc cluster 
with Anaconda/Jupyter/Component gateway. - Then we call DataProcSparkOperator to execute the jar defined by the arguments + Then we call DataprocSubmitSparkJobOperator to execute the jar defined by the arguments jar_urls and main_class. Once that succeeds, we teardown the cluster. **Example**: :: @@ -468,6 +498,7 @@ def moz_dataproc_jar_runner(parent_dag_name=None, install_component_gateway=install_component_gateway, aws_conn_id=aws_conn_id, gcp_conn_id=gcp_conn_id, + project_id=project_id, master_disk_type=master_disk_type, master_disk_size=master_disk_size, worker_disk_type=worker_disk_type, @@ -481,15 +512,17 @@ def moz_dataproc_jar_runner(parent_dag_name=None, with models.DAG(_dag_name, default_args=default_args) as dag: create_dataproc_cluster = dataproc_helper.create_cluster() - run_jar_on_dataproc = DataProcSparkOperator( + run_jar_on_dataproc = DataprocSubmitSparkJobOperator( cluster_name=cluster_name, region=region, task_id='run_jar_on_dataproc', job_name=job_name, - dataproc_spark_jars=jar_urls, + dataproc_jars=jar_urls, main_class=main_class, arguments=jar_args, - gcp_conn_id=gcp_conn_id) + gcp_conn_id=gcp_conn_id, + project_id=project_id + ) delete_dataproc_cluster = dataproc_helper.delete_cluster() @@ -512,8 +545,8 @@ def moz_dataproc_scriptrunner(parent_dag_name=None, region='us-west1', subnetwork_uri=None, internal_ip_only=None, - idle_delete_ttl='14400', - auto_delete_ttl='28800', + idle_delete_ttl=14400, + auto_delete_ttl=28800, master_machine_type='n1-standard-8', worker_machine_type='n1-standard-4', num_preemptible_workers=0, @@ -527,6 +560,7 @@ def moz_dataproc_scriptrunner(parent_dag_name=None, job_name=None, aws_conn_id=None, gcp_conn_id='google_cloud_airflow_dataproc', + project_id='airflow-dataproc', master_disk_type='pd-standard', worker_disk_type='pd-standard', master_disk_size=1024, @@ -538,7 +572,7 @@ def moz_dataproc_scriptrunner(parent_dag_name=None, """ This will initially create a GCP Dataproc cluster with Anaconda/Jupyter/Component gateway. Then we execute a script uri (either https or gcs) similar to how we use our custom AWS - EmrSparkOperator. This will call DataProcSparkOperator using EMR's script-runner.jar, which + EmrSparkOperator. This will call DataprocSubmitSparkJobOperator using EMR's script-runner.jar, which then executes the airflow_gcp.sh entrypoint script. The entrypoint script expects another script uri, along with it's arguments, as parameters. Once that succeeds, we teardown the cluster. @@ -609,6 +643,7 @@ def moz_dataproc_scriptrunner(parent_dag_name=None, install_component_gateway=install_component_gateway, aws_conn_id=aws_conn_id, gcp_conn_id=gcp_conn_id, + project_id=project_id, master_disk_type=master_disk_type, master_disk_size=master_disk_size, worker_disk_type=worker_disk_type, @@ -636,17 +671,19 @@ def moz_dataproc_scriptrunner(parent_dag_name=None, with models.DAG(_dag_name, default_args=default_args) as dag: create_dataproc_cluster = dataproc_helper.create_cluster() - # Run DataprocSparkOperator with script-runner.jar pointing to airflow_gcp.sh. + # Run DataprocSubmitSparkJobOperator with script-runner.jar pointing to airflow_gcp.sh. 
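
For context on how DAGs consume these migrated helpers, a minimal sketch of wrapping moz_dataproc_pyspark_runner in a SubDagOperator, in the style of parquet_export.py earlier in this diff. Only parameters visible in this file's signatures are used; the DAG id, bucket path, and driver script are placeholders, and arguments elided in the hunks above are omitted here as well:

    from datetime import datetime, timedelta

    from airflow import DAG
    from airflow.operators.subdag_operator import SubDagOperator
    from utils.dataproc import moz_dataproc_pyspark_runner

    default_args = {
        "owner": "example@mozilla.com",
        "start_date": datetime(2021, 6, 1),
        "retries": 2,
        "retry_delay": timedelta(minutes=30),
    }

    with DAG("example_pyspark_export", default_args=default_args, schedule_interval="@daily") as dag:
        example_export = SubDagOperator(
            task_id="example_export",
            dag=dag,
            subdag=moz_dataproc_pyspark_runner(
                parent_dag_name=dag.dag_id,
                default_args=default_args,
                cluster_name="example-export-{{ ds_nodash }}",
                job_name="example-export",
                python_driver_code="gs://<artifact-bucket>/jobs/example_job.py",
                py_args=["--submission-date", "{{ ds }}"],
                gcp_conn_id="google_cloud_airflow_dataproc",
                project_id="airflow-dataproc",
            ),
        )
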
- run_script_on_dataproc = DataProcSparkOperator( + run_script_on_dataproc = DataprocSubmitSparkJobOperator( cluster_name=cluster_name, region=region, task_id='run_script_on_dataproc', job_name=job_name, - dataproc_spark_jars=[jar_url], + dataproc_jars=[jar_url], main_class='com.amazon.elasticmapreduce.scriptrunner.ScriptRunner', arguments=args, - gcp_conn_id=gcp_conn_id) + gcp_conn_id=gcp_conn_id, + project_id=project_id + ) delete_dataproc_cluster = dataproc_helper.delete_cluster() @@ -715,13 +752,13 @@ def get_dataproc_parameters(conn_id="google_cloud_airflow_dataproc"): and should either be the production default ("dataproc-runner-prod"), or a service key associated with a sandbox account. """ - gcp_conn = GoogleCloudBaseHook(conn_id) - keyfile = json.loads(gcp_conn.extras["extra__google_cloud_platform__keyfile_dict"]) - - project_id = keyfile["project_id"] + dev_project_id = "replace_me" + dev_client_email = "replace_me" + is_dev = os.environ.get("DEPLOY_ENVIRONMENT") == "dev" + project_id = dev_project_id if is_dev else "airflow-dataproc" client_email = ( - keyfile["client_email"] + dev_client_email if is_dev else "dataproc-runner-prod@airflow-dataproc.iam.gserviceaccount.com" ) diff --git a/dags/utils/forecasting.py b/dags/utils/forecasting.py index ab5a1d309..7bd64a0b5 100644 --- a/dags/utils/forecasting.py +++ b/dags/utils/forecasting.py @@ -1,14 +1,12 @@ from operators.gcp_container_operator import GKEPodOperator -from airflow.contrib.hooks.gcp_api_base_hook import GoogleCloudBaseHook - def simpleprophet_forecast( task_id, datasource, - project_id, dataset_id, table_id, gcp_conn_id="google_cloud_derived_datasets", + project_id='moz-fx-data-derived-datasets', gke_location="us-central1-a", gke_cluster_name="bq-load-gke-1", gke_namespace="default", @@ -25,6 +23,7 @@ def simpleprophet_forecast( :param str table_id: [Required] ID of target table :param str gcp_conn_id: Airflow connection id for GCP access + :param str project_id: GCP project id associated with gcp_conn_id :param str gke_location: GKE cluster location :param str gke_cluster_name: GKE cluster name :param str gke_namespace: GKE cluster namespace @@ -40,7 +39,7 @@ def simpleprophet_forecast( return GKEPodOperator( task_id=task_id, gcp_conn_id=gcp_conn_id, - project_id=GoogleCloudBaseHook(gcp_conn_id=gcp_conn_id).project_id, + project_id=project_id, location=gke_location, cluster_name=gke_cluster_name, namespace=gke_namespace, diff --git a/dags/utils/gcp.py b/dags/utils/gcp.py index e722806bd..4d7694a8c 100644 --- a/dags/utils/gcp.py +++ b/dags/utils/gcp.py @@ -1,20 +1,25 @@ from airflow import models -from airflow.utils import trigger_rule from airflow.operators.dummy_operator import DummyOperator from airflow.operators.subdag_operator import SubDagOperator -from airflow.contrib.hooks.aws_hook import AwsHook -from airflow.contrib.hooks.gcp_api_base_hook import GoogleCloudBaseHook -from airflow.contrib.operators.dataproc_operator import DataprocClusterCreateOperator, DataprocClusterDeleteOperator, DataProcSparkOperator, DataProcPySparkOperator # noqa from operators.gcp_container_operator import GKEPodOperator -from airflow.contrib.operators.bigquery_table_delete_operator import BigQueryTableDeleteOperator # noqa:E501 -from airflow.contrib.operators.bigquery_to_gcs import BigQueryToCloudStorageOperator -from airflow.contrib.operators.gcp_transfer_operator import S3ToGoogleCloudStorageTransferOperator # noqa:E501 -from airflow.contrib.operators.gcs_delete_operator import GoogleCloudStorageDeleteOperator + +from 
airflow.providers.amazon.aws.hooks.base_aws import AwsBaseHook + +from airflow.providers.google.cloud.operators.dataproc import ( + DataprocCreateClusterOperator, + DataprocDeleteClusterOperator, + DataprocSubmitPySparkJobOperator, +) + +from airflow.providers.google.cloud.transfers.bigquery_to_gcs import BigQueryToGCSOperator + +from airflow.providers.google.cloud.operators.gcs import GCSDeleteObjectsOperator import json import re +GCP_PROJECT_ID = "moz-fx-data-derived-datasets" def export_to_parquet( table, @@ -67,7 +72,7 @@ def export_to_parquet( cluster_name += "-export-{{ ds_nodash }}" dag_prefix = parent_dag_name + "." if parent_dag_name else "" - connection = GoogleCloudBaseHook(gcp_conn_id=gcp_conn_id) + project_id = GCP_PROJECT_ID if destination_table is None: destination_table = unqualified_table @@ -82,11 +87,12 @@ def export_to_parquet( with models.DAG(dag_id=dag_prefix + dag_name, default_args=default_args) as dag: - create_dataproc_cluster = DataprocClusterCreateOperator( + create_dataproc_cluster = DataprocCreateClusterOperator( task_id="create_dataproc_cluster", cluster_name=cluster_name, gcp_conn_id=gcp_conn_id, - project_id=connection.project_id, + region="us-west1", + project_id=project_id, num_workers=num_workers, image_version="1.4", storage_bucket=dataproc_storage_bucket, @@ -100,13 +106,13 @@ def export_to_parquet( metadata={"PIP_PACKAGES": "google-cloud-bigquery==1.20.0"}, ) - run_dataproc_pyspark = DataProcPySparkOperator( + run_dataproc_pyspark = DataprocSubmitPySparkJobOperator( task_id="run_dataproc_pyspark", cluster_name=cluster_name, - dataproc_pyspark_jars=[ + dataproc_jars=[ "gs://spark-lib/bigquery/spark-bigquery-latest.jar" ], - dataproc_pyspark_properties={ + dataproc_properties={ "spark.jars.packages": "org.apache.spark:spark-avro_2.11:2.4.4", }, main="https://raw.githubusercontent.com/mozilla/bigquery-etl/main" @@ -125,31 +131,33 @@ def export_to_parquet( + [static_partitions] + arguments, gcp_conn_id=gcp_conn_id, + project_id=project_id, ) - delete_dataproc_cluster = DataprocClusterDeleteOperator( + delete_dataproc_cluster = DataprocDeleteClusterOperator( task_id="delete_dataproc_cluster", cluster_name=cluster_name, gcp_conn_id=gcp_conn_id, - project_id=connection.project_id, - trigger_rule=trigger_rule.TriggerRule.ALL_DONE, + project_id=project_id, + trigger_rule="all_done", + region="us-west1", ) if not use_storage_api: - avro_export = BigQueryToCloudStorageOperator( + avro_export = BigQueryToGCSOperator( task_id="avro_export", source_project_dataset_table=table, destination_cloud_storage_uris=avro_path, compression=None, export_format="AVRO", - bigquery_conn_id=gcp_conn_id, + gcp_conn_id=gcp_conn_id, ) - avro_delete = GoogleCloudStorageDeleteOperator( + avro_delete = GCSDeleteObjectsOperator( task_id="avro_delete", bucket_name=gcs_output_bucket, prefix=avro_prefix, - google_cloud_storage_conn_id=gcp_conn_id, - trigger_rule=trigger_rule.TriggerRule.ALL_DONE, + gcp_conn_id=gcp_conn_id, + trigger_rule="all_done", ) avro_export >> run_dataproc_pyspark >> avro_delete @@ -210,7 +218,7 @@ def bigquery_etl_query( parameters += (date_partition_parameter + ":DATE:{{ds}}",) return GKEPodOperator( gcp_conn_id=gcp_conn_id, - project_id=GoogleCloudBaseHook(gcp_conn_id=gcp_conn_id).project_id, + project_id=project_id, location=gke_location, cluster_name=gke_cluster_name, namespace=gke_namespace, @@ -278,7 +286,7 @@ def bigquery_etl_copy_deduplicate( return GKEPodOperator( task_id=task_id, gcp_conn_id=gcp_conn_id, - 
project_id=GoogleCloudBaseHook(gcp_conn_id=gcp_conn_id).project_id, + project_id=GCP_PROJECT_ID, location=gke_location, cluster_name=gke_cluster_name, namespace=gke_namespace, @@ -341,7 +349,7 @@ def bigquery_xcom_query( query = "{{ " + "task_instance.xcom_pull({!r})".format(xcom_task_id) + " }}" return GKEPodOperator( gcp_conn_id=gcp_conn_id, - project_id=GoogleCloudBaseHook(gcp_conn_id=gcp_conn_id).project_id, + project_id=GCP_PROJECT_ID, location=gke_location, cluster_name=gke_cluster_name, namespace=gke_namespace, @@ -407,7 +415,7 @@ def gke_command( key: value for key, value in zip( ("AWS_ACCESS_KEY_ID", "AWS_SECRET_ACCESS_KEY", "AWS_SESSION_TOKEN"), - AwsHook(aws_conn_id).get_credentials() if aws_conn_id else (), + AwsBaseHook(aws_conn_id=aws_conn_id, client_type='s3').get_credentials() if aws_conn_id else (), ) if value is not None} context_env_vars["XCOM_PUSH"] = json.dumps(xcom_push) @@ -416,7 +424,7 @@ def gke_command( return GKEPodOperator( task_id=task_id, gcp_conn_id=gcp_conn_id, - project_id=GoogleCloudBaseHook(gcp_conn_id=gcp_conn_id).project_id, + project_id=GCP_PROJECT_ID, location=gke_location, cluster_name=gke_cluster_name, namespace=gke_namespace, diff --git a/dags/utils/patched/__init__.py b/dags/utils/patched/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/dags/utils/patched/dataproc_hook.py b/dags/utils/patched/dataproc_hook.py new file mode 100644 index 000000000..e72caf292 --- /dev/null +++ b/dags/utils/patched/dataproc_hook.py @@ -0,0 +1,1047 @@ +# +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. +# +"""This module contains a Google Cloud Dataproc hook.""" + +""" +I have copy pasted this from the providers-google/5.0.0 branch +https://github.com/apache/airflow/blob/providers-google/5.0.0/airflow/providers/google/cloud/hooks/dataproc.py + +This changes all imports from dataproc_v1beta2 to dataproc_v1 +We import DataProcHook/DataProcJobBuilder from here into our +other patched file dataproc_operator.py which contains Operators +for use with our utils/dataproc.py wrapper +code. See the other patched file for more reasons why. 
+ +We replaced all instances in this file of dataproc_v1beta2 with dataproc_v1 +""" + +import time +import uuid +import warnings +from typing import Any, Dict, Iterable, List, Optional, Sequence, Tuple, Union + +from google.api_core.exceptions import ServerError +from google.api_core.retry import Retry +# Moz specific - import has changed +from google.cloud.dataproc_v1 import ( + Cluster, + ClusterControllerClient, + Job, + JobControllerClient, + JobStatus, + WorkflowTemplate, + WorkflowTemplateServiceClient, +) +from google.protobuf.duration_pb2 import Duration +from google.protobuf.field_mask_pb2 import FieldMask + +from airflow.exceptions import AirflowException +from airflow.providers.google.common.hooks.base_google import GoogleBaseHook +from airflow.version import version as airflow_version + + +class DataProcJobBuilder: + """A helper class for building Dataproc job.""" + + def __init__( + self, + project_id: str, + task_id: str, + cluster_name: str, + job_type: str, + properties: Optional[Dict[str, str]] = None, + ) -> None: + name = task_id + "_" + str(uuid.uuid4())[:8] + self.job_type = job_type + self.job = { + "job": { + "reference": {"project_id": project_id, "job_id": name}, + "placement": {"cluster_name": cluster_name}, + "labels": {'airflow-version': 'v' + airflow_version.replace('.', '-').replace('+', '-')}, + job_type: {}, + } + } # type: Dict[str, Any] + if properties is not None: + self.job["job"][job_type]["properties"] = properties + + def add_labels(self, labels: dict) -> None: + """ + Set labels for Dataproc job. + + :param labels: Labels for the job query. + :type labels: dict + """ + if labels: + self.job["job"]["labels"].update(labels) + + def add_variables(self, variables: List[str]) -> None: + """ + Set variables for Dataproc job. + + :param variables: Variables for the job query. + :type variables: List[str] + """ + if variables is not None: + self.job["job"][self.job_type]["script_variables"] = variables + + def add_args(self, args: List[str]) -> None: + """ + Set args for Dataproc job. + + :param args: Args for the job query. + :type args: List[str] + """ + if args is not None: + self.job["job"][self.job_type]["args"] = args + + def add_query(self, query: List[str]) -> None: + """ + Set query uris for Dataproc job. + + :param query: URIs for the job queries. + :type query: List[str] + """ + self.job["job"][self.job_type]["query_list"] = {'queries': [query]} + + def add_query_uri(self, query_uri: str) -> None: + """ + Set query uri for Dataproc job. + + :param query_uri: URI for the job query. + :type query_uri: str + """ + self.job["job"][self.job_type]["query_file_uri"] = query_uri + + def add_jar_file_uris(self, jars: List[str]) -> None: + """ + Set jars uris for Dataproc job. + + :param jars: List of jars URIs + :type jars: List[str] + """ + if jars is not None: + self.job["job"][self.job_type]["jar_file_uris"] = jars + + def add_archive_uris(self, archives: List[str]) -> None: + """ + Set archives uris for Dataproc job. + + :param archives: List of archives URIs + :type archives: List[str] + """ + if archives is not None: + self.job["job"][self.job_type]["archive_uris"] = archives + + def add_file_uris(self, files: List[str]) -> None: + """ + Set file uris for Dataproc job. + + :param files: List of files URIs + :type files: List[str] + """ + if files is not None: + self.job["job"][self.job_type]["file_uris"] = files + + def add_python_file_uris(self, pyfiles: List[str]) -> None: + """ + Set python file uris for Dataproc job. 
+ + :param pyfiles: List of python files URIs + :type pyfiles: List[str] + """ + if pyfiles is not None: + self.job["job"][self.job_type]["python_file_uris"] = pyfiles + + def set_main(self, main_jar: Optional[str], main_class: Optional[str]) -> None: + """ + Set Dataproc main class. + + :param main_jar: URI for the main file. + :type main_jar: str + :param main_class: Name of the main class. + :type main_class: str + :raises: Exception + """ + if main_class is not None and main_jar is not None: + raise Exception("Set either main_jar or main_class") + if main_jar: + self.job["job"][self.job_type]["main_jar_file_uri"] = main_jar + else: + self.job["job"][self.job_type]["main_class"] = main_class + + def set_python_main(self, main: str) -> None: + """ + Set Dataproc main python file uri. + + :param main: URI for the python main file. + :type main: str + """ + self.job["job"][self.job_type]["main_python_file_uri"] = main + + def set_job_name(self, name: str) -> None: + """ + Set Dataproc job name. + + :param name: Job name. + :type name: str + """ + self.job["job"]["reference"]["job_id"] = name + "_" + str(uuid.uuid4())[:8] + + def build(self) -> Dict: + """ + Returns Dataproc job. + + :return: Dataproc job + :rtype: dict + """ + return self.job + + +class DataprocHook(GoogleBaseHook): + """ + Hook for Google Cloud Dataproc APIs. + + All the methods in the hook where project_id is used must be called with + keyword arguments rather than positional. + """ + + def get_cluster_client( + self, region: Optional[str] = None, location: Optional[str] = None + ) -> ClusterControllerClient: + """Returns ClusterControllerClient.""" + if location is not None: + warnings.warn( + "Parameter `location` will be deprecated. " + "Please provide value through `region` parameter instead.", + DeprecationWarning, + stacklevel=1, + ) + region = location + client_options = None + if region and region != 'global': + client_options = {'api_endpoint': f'{region}-dataproc.googleapis.com:443'} + + return ClusterControllerClient( + credentials=self._get_credentials(), client_info=self.client_info, client_options=client_options + ) + + def get_template_client( + self, region: Optional[str] = None, location: Optional[str] = None + ) -> WorkflowTemplateServiceClient: + """Returns WorkflowTemplateServiceClient.""" + if location is not None: + warnings.warn( + "Parameter `location` will be deprecated. " + "Please provide value through `region` parameter instead.", + DeprecationWarning, + stacklevel=1, + ) + region = location + client_options = None + if region and region != 'global': + client_options = {'api_endpoint': f'{region}-dataproc.googleapis.com:443'} + + return WorkflowTemplateServiceClient( + credentials=self._get_credentials(), client_info=self.client_info, client_options=client_options + ) + + def get_job_client( + self, region: Optional[str] = None, location: Optional[str] = None + ) -> JobControllerClient: + """Returns JobControllerClient.""" + if location is not None: + warnings.warn( + "Parameter `location` will be deprecated. 
" + "Please provide value through `region` parameter instead.", + DeprecationWarning, + stacklevel=1, + ) + region = location + client_options = None + if region and region != 'global': + client_options = {'api_endpoint': f'{region}-dataproc.googleapis.com:443'} + + return JobControllerClient( + credentials=self._get_credentials(), client_info=self.client_info, client_options=client_options + ) + + @GoogleBaseHook.fallback_to_default_project_id + def create_cluster( + self, + region: str, + project_id: str, + cluster_name: str, + cluster_config: Union[Dict, Cluster], + labels: Optional[Dict[str, str]] = None, + request_id: Optional[str] = None, + retry: Optional[Retry] = None, + timeout: Optional[float] = None, + metadata: Optional[Sequence[Tuple[str, str]]] = None, + ): + """ + Creates a cluster in a project. + + :param project_id: Required. The ID of the Google Cloud project that the cluster belongs to. + :type project_id: str + :param region: Required. The Cloud Dataproc region in which to handle the request. + :type region: str + :param cluster_name: Name of the cluster to create + :type cluster_name: str + :param labels: Labels that will be assigned to created cluster + :type labels: Dict[str, str] + :param cluster_config: Required. The cluster config to create. + If a dict is provided, it must be of the same form as the protobuf message + :class:`~google.cloud.dataproc_v1.types.ClusterConfig` + :type cluster_config: Union[Dict, google.cloud.dataproc_v1.types.ClusterConfig] + :param request_id: Optional. A unique id used to identify the request. If the server receives two + ``CreateClusterRequest`` requests with the same id, then the second request will be ignored and + the first ``google.longrunning.Operation`` created and stored in the backend is returned. + :type request_id: str + :param retry: A retry object used to retry requests. If ``None`` is specified, requests will not be + retried. + :type retry: google.api_core.retry.Retry + :param timeout: The amount of time, in seconds, to wait for the request to complete. Note that if + ``retry`` is specified, the timeout applies to each individual attempt. + :type timeout: float + :param metadata: Additional metadata that is provided to the method. + :type metadata: Sequence[Tuple[str, str]] + """ + # Dataproc labels must conform to the following regex: + # [a-z]([-a-z0-9]*[a-z0-9])? (current airflow version string follows + # semantic versioning spec: x.y.z). + labels = labels or {} + labels.update({'airflow-version': 'v' + airflow_version.replace('.', '-').replace('+', '-')}) + + cluster = { + "project_id": project_id, + "cluster_name": cluster_name, + "config": cluster_config, + "labels": labels, + } + + client = self.get_cluster_client(region=region) + result = client.create_cluster( + request={ + 'project_id': project_id, + 'region': region, + 'cluster': cluster, + 'request_id': request_id, + }, + retry=retry, + timeout=timeout, + metadata=metadata, + ) + return result + + @GoogleBaseHook.fallback_to_default_project_id + def delete_cluster( + self, + region: str, + cluster_name: str, + project_id: str, + cluster_uuid: Optional[str] = None, + request_id: Optional[str] = None, + retry: Optional[Retry] = None, + timeout: Optional[float] = None, + metadata: Optional[Sequence[Tuple[str, str]]] = None, + ): + """ + Deletes a cluster in a project. + + :param project_id: Required. The ID of the Google Cloud project that the cluster belongs to. + :type project_id: str + :param region: Required. 
The Cloud Dataproc region in which to handle the request. + :type region: str + :param cluster_name: Required. The cluster name. + :type cluster_name: str + :param cluster_uuid: Optional. Specifying the ``cluster_uuid`` means the RPC should fail + if cluster with specified UUID does not exist. + :type cluster_uuid: str + :param request_id: Optional. A unique id used to identify the request. If the server receives two + ``DeleteClusterRequest`` requests with the same id, then the second request will be ignored and + the first ``google.longrunning.Operation`` created and stored in the backend is returned. + :type request_id: str + :param retry: A retry object used to retry requests. If ``None`` is specified, requests will not be + retried. + :type retry: google.api_core.retry.Retry + :param timeout: The amount of time, in seconds, to wait for the request to complete. Note that if + ``retry`` is specified, the timeout applies to each individual attempt. + :type timeout: float + :param metadata: Additional metadata that is provided to the method. + :type metadata: Sequence[Tuple[str, str]] + """ + client = self.get_cluster_client(region=region) + result = client.delete_cluster( + request={ + 'project_id': project_id, + 'region': region, + 'cluster_name': cluster_name, + 'cluster_uuid': cluster_uuid, + 'request_id': request_id, + }, + retry=retry, + timeout=timeout, + metadata=metadata, + ) + return result + + @GoogleBaseHook.fallback_to_default_project_id + def diagnose_cluster( + self, + region: str, + cluster_name: str, + project_id: str, + retry: Optional[Retry] = None, + timeout: Optional[float] = None, + metadata: Optional[Sequence[Tuple[str, str]]] = None, + ): + """ + Gets cluster diagnostic information. After the operation completes GCS uri to + diagnose is returned + + :param project_id: Required. The ID of the Google Cloud project that the cluster belongs to. + :type project_id: str + :param region: Required. The Cloud Dataproc region in which to handle the request. + :type region: str + :param cluster_name: Required. The cluster name. + :type cluster_name: str + :param retry: A retry object used to retry requests. If ``None`` is specified, requests will not be + retried. + :type retry: google.api_core.retry.Retry + :param timeout: The amount of time, in seconds, to wait for the request to complete. Note that if + ``retry`` is specified, the timeout applies to each individual attempt. + :type timeout: float + :param metadata: Additional metadata that is provided to the method. + :type metadata: Sequence[Tuple[str, str]] + """ + client = self.get_cluster_client(region=region) + operation = client.diagnose_cluster( + request={'project_id': project_id, 'region': region, 'cluster_name': cluster_name}, + retry=retry, + timeout=timeout, + metadata=metadata, + ) + operation.result() + gcs_uri = str(operation.operation.response.value) + return gcs_uri + + @GoogleBaseHook.fallback_to_default_project_id + def get_cluster( + self, + region: str, + cluster_name: str, + project_id: str, + retry: Optional[Retry] = None, + timeout: Optional[float] = None, + metadata: Optional[Sequence[Tuple[str, str]]] = None, + ): + """ + Gets the resource representation for a cluster in a project. + + :param project_id: Required. The ID of the Google Cloud project that the cluster belongs to. + :type project_id: str + :param region: Required. The Cloud Dataproc region in which to handle the request. + :type region: str + :param cluster_name: Required. The cluster name. 
+ :type cluster_name: str + :param retry: A retry object used to retry requests. If ``None`` is specified, requests will not be + retried. + :type retry: google.api_core.retry.Retry + :param timeout: The amount of time, in seconds, to wait for the request to complete. Note that if + ``retry`` is specified, the timeout applies to each individual attempt. + :type timeout: float + :param metadata: Additional metadata that is provided to the method. + :type metadata: Sequence[Tuple[str, str]] + """ + client = self.get_cluster_client(region=region) + result = client.get_cluster( + request={'project_id': project_id, 'region': region, 'cluster_name': cluster_name}, + retry=retry, + timeout=timeout, + metadata=metadata, + ) + return result + + @GoogleBaseHook.fallback_to_default_project_id + def list_clusters( + self, + region: str, + filter_: str, + project_id: str, + page_size: Optional[int] = None, + retry: Optional[Retry] = None, + timeout: Optional[float] = None, + metadata: Optional[Sequence[Tuple[str, str]]] = None, + ): + """ + Lists all regions/{region}/clusters in a project. + + :param project_id: Required. The ID of the Google Cloud project that the cluster belongs to. + :type project_id: str + :param region: Required. The Cloud Dataproc region in which to handle the request. + :type region: str + :param filter_: Optional. A filter constraining the clusters to list. Filters are case-sensitive. + :type filter_: str + :param page_size: The maximum number of resources contained in the underlying API response. If page + streaming is performed per- resource, this parameter does not affect the return value. If page + streaming is performed per-page, this determines the maximum number of resources in a page. + :type page_size: int + :param retry: A retry object used to retry requests. If ``None`` is specified, requests will not be + retried. + :type retry: google.api_core.retry.Retry + :param timeout: The amount of time, in seconds, to wait for the request to complete. Note that if + ``retry`` is specified, the timeout applies to each individual attempt. + :type timeout: float + :param metadata: Additional metadata that is provided to the method. + :type metadata: Sequence[Tuple[str, str]] + """ + client = self.get_cluster_client(region=region) + result = client.list_clusters( + request={'project_id': project_id, 'region': region, 'filter': filter_, 'page_size': page_size}, + retry=retry, + timeout=timeout, + metadata=metadata, + ) + return result + + @GoogleBaseHook.fallback_to_default_project_id + def update_cluster( + self, + cluster_name: str, + cluster: Union[Dict, Cluster], + update_mask: Union[Dict, FieldMask], + project_id: str, + region: str = None, + location: Optional[str] = None, + graceful_decommission_timeout: Optional[Union[Dict, Duration]] = None, + request_id: Optional[str] = None, + retry: Optional[Retry] = None, + timeout: Optional[float] = None, + metadata: Optional[Sequence[Tuple[str, str]]] = None, + ): + """ + Updates a cluster in a project. + + :param project_id: Required. The ID of the Google Cloud project the cluster belongs to. + :type project_id: str + :param region: Required. The Cloud Dataproc region in which to handle the request. + :type region: str + :param location: (To be deprecated). The Cloud Dataproc region in which to handle the request. + :type location: str + :param cluster_name: Required. The cluster name. + :type cluster_name: str + :param cluster: Required. The changes to the cluster. 
+ + If a dict is provided, it must be of the same form as the protobuf message + :class:`~google.cloud.dataproc_v1.types.Cluster` + :type cluster: Union[Dict, google.cloud.dataproc_v1.types.Cluster] + :param update_mask: Required. Specifies the path, relative to ``Cluster``, of the field to update. For + example, to change the number of workers in a cluster to 5, the ``update_mask`` parameter would be + specified as ``config.worker_config.num_instances``, and the ``PATCH`` request body would specify + the new value, as follows: + + :: + + { "config":{ "workerConfig":{ "numInstances":"5" } } } + + Similarly, to change the number of preemptible workers in a cluster to 5, the ``update_mask`` + parameter would be ``config.secondary_worker_config.num_instances``, and the ``PATCH`` request + body would be set as follows: + + :: + + { "config":{ "secondaryWorkerConfig":{ "numInstances":"5" } } } + + If a dict is provided, it must be of the same form as the protobuf message + :class:`~google.cloud.dataproc_v1.types.FieldMask` + :type update_mask: Union[Dict, google.cloud.dataproc_v1.types.FieldMask] + :param graceful_decommission_timeout: Optional. Timeout for graceful YARN decommissioning. Graceful + decommissioning allows removing nodes from the cluster without interrupting jobs in progress. + Timeout specifies how long to wait for jobs in progress to finish before forcefully removing nodes + (and potentially interrupting jobs). Default timeout is 0 (for forceful decommission), and the + maximum allowed timeout is 1 day. + + Only supported on Dataproc image versions 1.2 and higher. + + If a dict is provided, it must be of the same form as the protobuf message + :class:`~google.cloud.dataproc_v1.types.Duration` + :type graceful_decommission_timeout: Union[Dict, google.cloud.dataproc_v1.types.Duration] + :param request_id: Optional. A unique id used to identify the request. If the server receives two + ``UpdateClusterRequest`` requests with the same id, then the second request will be ignored and + the first ``google.longrunning.Operation`` created and stored in the backend is returned. + :type request_id: str + :param retry: A retry object used to retry requests. If ``None`` is specified, requests will not be + retried. + :type retry: google.api_core.retry.Retry + :param timeout: The amount of time, in seconds, to wait for the request to complete. Note that if + ``retry`` is specified, the timeout applies to each individual attempt. + :type timeout: float + :param metadata: Additional metadata that is provided to the method. + :type metadata: Sequence[Tuple[str, str]] + """ + if region is None: + if location is not None: + warnings.warn( + "Parameter `location` will be deprecated. 
" + "Please provide value through `region` parameter instead.", + DeprecationWarning, + stacklevel=1, + ) + region = location + else: + raise TypeError("missing 1 required keyword argument: 'region'") + client = self.get_cluster_client(region=region) + operation = client.update_cluster( + request={ + 'project_id': project_id, + 'region': region, + 'cluster_name': cluster_name, + 'cluster': cluster, + 'update_mask': update_mask, + 'graceful_decommission_timeout': graceful_decommission_timeout, + 'request_id': request_id, + }, + retry=retry, + timeout=timeout, + metadata=metadata, + ) + return operation + + @GoogleBaseHook.fallback_to_default_project_id + def create_workflow_template( + self, + template: Union[Dict, WorkflowTemplate], + project_id: str, + region: str = None, + location: Optional[str] = None, + retry: Optional[Retry] = None, + timeout: Optional[float] = None, + metadata: Optional[Sequence[Tuple[str, str]]] = None, + ) -> WorkflowTemplate: + """ + Creates new workflow template. + + :param project_id: Required. The ID of the Google Cloud project the cluster belongs to. + :type project_id: str + :param region: Required. The Cloud Dataproc region in which to handle the request. + :type region: str + :param location: (To be deprecated). The Cloud Dataproc region in which to handle the request. + :type location: str + :param template: The Dataproc workflow template to create. If a dict is provided, + it must be of the same form as the protobuf message WorkflowTemplate. + :type template: Union[dict, WorkflowTemplate] + :param retry: A retry object used to retry requests. If ``None`` is specified, requests will not be + retried. + :type retry: google.api_core.retry.Retry + :param timeout: The amount of time, in seconds, to wait for the request to complete. Note that if + ``retry`` is specified, the timeout applies to each individual attempt. + :type timeout: float + :param metadata: Additional metadata that is provided to the method. + :type metadata: Sequence[Tuple[str, str]] + """ + if region is None: + if location is not None: + warnings.warn( + "Parameter `location` will be deprecated. " + "Please provide value through `region` parameter instead.", + DeprecationWarning, + stacklevel=1, + ) + region = location + else: + raise TypeError("missing 1 required keyword argument: 'region'") + metadata = metadata or () + client = self.get_template_client(region) + parent = f'projects/{project_id}/regions/{region}' + return client.create_workflow_template( + request={'parent': parent, 'template': template}, retry=retry, timeout=timeout, metadata=metadata + ) + + @GoogleBaseHook.fallback_to_default_project_id + def instantiate_workflow_template( + self, + template_name: str, + project_id: str, + region: str = None, + location: Optional[str] = None, + version: Optional[int] = None, + request_id: Optional[str] = None, + parameters: Optional[Dict[str, str]] = None, + retry: Optional[Retry] = None, + timeout: Optional[float] = None, + metadata: Optional[Sequence[Tuple[str, str]]] = None, + ): # pylint: disable=too-many-arguments + """ + Instantiates a template and begins execution. + + :param template_name: Name of template to instantiate. + :type template_name: str + :param project_id: Required. The ID of the Google Cloud project the cluster belongs to. + :type project_id: str + :param region: Required. The Cloud Dataproc region in which to handle the request. + :type region: str + :param location: (To be deprecated). The Cloud Dataproc region in which to handle the request. 
+ :type location: str + :param version: Optional. The version of workflow template to instantiate. If specified, + the workflow will be instantiated only if the current version of + the workflow template has the supplied version. + This option cannot be used to instantiate a previous version of + workflow template. + :type version: int + :param request_id: Optional. A tag that prevents multiple concurrent workflow instances + with the same tag from running. This mitigates risk of concurrent + instances started due to retries. + :type request_id: str + :param parameters: Optional. Map from parameter names to values that should be used for those + parameters. Values may not exceed 100 characters. + :type parameters: Dict[str, str] + :param retry: A retry object used to retry requests. If ``None`` is specified, requests will not be + retried. + :type retry: google.api_core.retry.Retry + :param timeout: The amount of time, in seconds, to wait for the request to complete. Note that if + ``retry`` is specified, the timeout applies to each individual attempt. + :type timeout: float + :param metadata: Additional metadata that is provided to the method. + :type metadata: Sequence[Tuple[str, str]] + """ + if region is None: + if location is not None: + warnings.warn( + "Parameter `location` will be deprecated. " + "Please provide value through `region` parameter instead.", + DeprecationWarning, + stacklevel=1, + ) + region = location + else: + raise TypeError("missing 1 required keyword argument: 'region'") + metadata = metadata or () + client = self.get_template_client(region) + name = f'projects/{project_id}/regions/{region}/workflowTemplates/{template_name}' + operation = client.instantiate_workflow_template( + request={'name': name, 'version': version, 'request_id': request_id, 'parameters': parameters}, + retry=retry, + timeout=timeout, + metadata=metadata, + ) + return operation + + @GoogleBaseHook.fallback_to_default_project_id + def instantiate_inline_workflow_template( + self, + template: Union[Dict, WorkflowTemplate], + project_id: str, + region: str = None, + location: Optional[str] = None, + request_id: Optional[str] = None, + retry: Optional[Retry] = None, + timeout: Optional[float] = None, + metadata: Optional[Sequence[Tuple[str, str]]] = None, + ): + """ + Instantiates a template and begins execution. + + :param template: The workflow template to instantiate. If a dict is provided, + it must be of the same form as the protobuf message WorkflowTemplate + :type template: Union[Dict, WorkflowTemplate] + :param project_id: Required. The ID of the Google Cloud project the cluster belongs to. + :type project_id: str + :param region: Required. The Cloud Dataproc region in which to handle the request. + :type region: str + :param location: (To be deprecated). The Cloud Dataproc region in which to handle the request. + :type location: str + :param request_id: Optional. A tag that prevents multiple concurrent workflow instances + with the same tag from running. This mitigates risk of concurrent + instances started due to retries. + :type request_id: str + :param retry: A retry object used to retry requests. If ``None`` is specified, requests will not be + retried. + :type retry: google.api_core.retry.Retry + :param timeout: The amount of time, in seconds, to wait for the request to complete. Note that if + ``retry`` is specified, the timeout applies to each individual attempt. + :type timeout: float + :param metadata: Additional metadata that is provided to the method. 
+ :type metadata: Sequence[Tuple[str, str]] + """ + if region is None: + if location is not None: + warnings.warn( + "Parameter `location` will be deprecated. " + "Please provide value through `region` parameter instead.", + DeprecationWarning, + stacklevel=1, + ) + region = location + else: + raise TypeError("missing 1 required keyword argument: 'region'") + metadata = metadata or () + client = self.get_template_client(region) + parent = f'projects/{project_id}/regions/{region}' + operation = client.instantiate_inline_workflow_template( + request={'parent': parent, 'template': template, 'request_id': request_id}, + retry=retry, + timeout=timeout, + metadata=metadata, + ) + return operation + + @GoogleBaseHook.fallback_to_default_project_id + def wait_for_job( + self, + job_id: str, + project_id: str, + wait_time: int = 10, + region: str = None, + location: Optional[str] = None, + timeout: Optional[int] = None, + ) -> None: + """ + Helper method which polls a job to check if it finishes. + + :param job_id: Id of the Dataproc job + :type job_id: str + :param project_id: Required. The ID of the Google Cloud project the cluster belongs to. + :type project_id: str + :param region: Required. The Cloud Dataproc region in which to handle the request. + :type region: str + :param location: (To be deprecated). The Cloud Dataproc region in which to handle the request. + :type location: str + :param wait_time: Number of seconds between checks + :type wait_time: int + :param timeout: How many seconds wait for job to be ready. Used only if ``asynchronous`` is False + :type timeout: int + """ + if region is None: + if location is not None: + warnings.warn( + "Parameter `location` will be deprecated. " + "Please provide value through `region` parameter instead.", + DeprecationWarning, + stacklevel=1, + ) + region = location + else: + raise TypeError("missing 1 required keyword argument: 'region'") + state = None + start = time.monotonic() + while state not in (JobStatus.State.ERROR, JobStatus.State.DONE, JobStatus.State.CANCELLED): + if timeout and start + timeout < time.monotonic(): + raise AirflowException(f"Timeout: dataproc job {job_id} is not ready after {timeout}s") + time.sleep(wait_time) + try: + job = self.get_job(project_id=project_id, region=region, job_id=job_id) + state = job.status.state + except ServerError as err: + self.log.info("Retrying. Dataproc API returned server error when waiting for job: %s", err) + + if state == JobStatus.State.ERROR: + raise AirflowException(f'Job failed:\n{job}') + if state == JobStatus.State.CANCELLED: + raise AirflowException(f'Job was cancelled:\n{job}') + + @GoogleBaseHook.fallback_to_default_project_id + def get_job( + self, + job_id: str, + project_id: str, + region: str = None, + location: Optional[str] = None, + retry: Optional[Retry] = None, + timeout: Optional[float] = None, + metadata: Optional[Sequence[Tuple[str, str]]] = None, + ) -> Job: + """ + Gets the resource representation for a job in a project. + + :param job_id: Id of the Dataproc job + :type job_id: str + :param project_id: Required. The ID of the Google Cloud project the cluster belongs to. + :type project_id: str + :param region: Required. The Cloud Dataproc region in which to handle the request. + :type region: str + :param location: (To be deprecated). The Cloud Dataproc region in which to handle the request. + :type location: str + :param retry: A retry object used to retry requests. If ``None`` is specified, requests will not be + retried. 
+ :type retry: google.api_core.retry.Retry + :param timeout: The amount of time, in seconds, to wait for the request to complete. Note that if + ``retry`` is specified, the timeout applies to each individual attempt. + :type timeout: float + :param metadata: Additional metadata that is provided to the method. + :type metadata: Sequence[Tuple[str, str]] + """ + if region is None: + if location is not None: + warnings.warn( + "Parameter `location` will be deprecated. " + "Please provide value through `region` parameter instead.", + DeprecationWarning, + stacklevel=1, + ) + region = location + else: + raise TypeError("missing 1 required keyword argument: 'region'") + client = self.get_job_client(region=region) + job = client.get_job( + request={'project_id': project_id, 'region': region, 'job_id': job_id}, + retry=retry, + timeout=timeout, + metadata=metadata, + ) + return job + + @GoogleBaseHook.fallback_to_default_project_id + def submit_job( + self, + job: Union[dict, Job], + project_id: str, + region: str = None, + location: Optional[str] = None, + request_id: Optional[str] = None, + retry: Optional[Retry] = None, + timeout: Optional[float] = None, + metadata: Optional[Sequence[Tuple[str, str]]] = None, + ) -> Job: + """ + Submits a job to a cluster. + + :param job: The job resource. If a dict is provided, + it must be of the same form as the protobuf message Job + :type job: Union[Dict, Job] + :param project_id: Required. The ID of the Google Cloud project the cluster belongs to. + :type project_id: str + :param region: Required. The Cloud Dataproc region in which to handle the request. + :type region: str + :param location: (To be deprecated). The Cloud Dataproc region in which to handle the request. + :type location: str + :param request_id: Optional. A tag that prevents multiple concurrent workflow instances + with the same tag from running. This mitigates risk of concurrent + instances started due to retries. + :type request_id: str + :param retry: A retry object used to retry requests. If ``None`` is specified, requests will not be + retried. + :type retry: google.api_core.retry.Retry + :param timeout: The amount of time, in seconds, to wait for the request to complete. Note that if + ``retry`` is specified, the timeout applies to each individual attempt. + :type timeout: float + :param metadata: Additional metadata that is provided to the method. + :type metadata: Sequence[Tuple[str, str]] + """ + if region is None: + if location is not None: + warnings.warn( + "Parameter `location` will be deprecated. " + "Please provide value through `region` parameter instead.", + DeprecationWarning, + stacklevel=1, + ) + region = location + else: + raise TypeError("missing 1 required keyword argument: 'region'") + client = self.get_job_client(region=region) + return client.submit_job( + request={'project_id': project_id, 'region': region, 'job': job, 'request_id': request_id}, + retry=retry, + timeout=timeout, + metadata=metadata, + ) + + def submit( + self, + project_id: str, + job: dict, + region: str = 'global', + job_error_states: Optional[Iterable[str]] = None, + ) -> None: + """ + Submits Google Cloud Dataproc job. + + :param project_id: The id of Google Cloud Dataproc project. + :type project_id: str + :param job: The job to be submitted + :type job: dict + :param region: The region of Google Dataproc cluster. + :type region: str + :param job_error_states: Job states that should be considered error states. 
+ :type job_error_states: List[str] + """ + # TODO: Remover one day + warnings.warn("This method is deprecated. Please use `submit_job`", DeprecationWarning, stacklevel=2) + job_object = self.submit_job(region=region, project_id=project_id, job=job) + job_id = job_object.reference.job_id + self.wait_for_job(job_id=job_id, region=region, project_id=project_id) + + @GoogleBaseHook.fallback_to_default_project_id + def cancel_job( + self, + job_id: str, + project_id: str, + region: Optional[str] = None, + location: Optional[str] = None, + retry: Optional[Retry] = None, + timeout: Optional[float] = None, + metadata: Optional[Sequence[Tuple[str, str]]] = None, + ) -> Job: + """ + Starts a job cancellation request. + + :param project_id: Required. The ID of the Google Cloud project that the job belongs to. + :type project_id: str + :param region: Required. The Cloud Dataproc region in which to handle the request. + :type region: str + :param location: (To be deprecated). The Cloud Dataproc region in which to handle the request. + :type location: str + :param job_id: Required. The job ID. + :type job_id: str + :param retry: A retry object used to retry requests. If ``None`` is specified, requests will not be + retried. + :type retry: google.api_core.retry.Retry + :param timeout: The amount of time, in seconds, to wait for the request to complete. Note that if + ``retry`` is specified, the timeout applies to each individual attempt. + :type timeout: float + :param metadata: Additional metadata that is provided to the method. + :type metadata: Sequence[Tuple[str, str]] + """ + if region is None: + if location is not None: + warnings.warn( + "Parameter `location` will be deprecated. " + "Please provide value through `region` parameter instead.", + DeprecationWarning, + stacklevel=1, + ) + region = location + + if region is None: + warnings.warn( + "Default region value `global` will be deprecated. Please, provide region value.", + DeprecationWarning, + stacklevel=2, + ) + region = 'global' + client = self.get_job_client(region=region) + + job = client.cancel_job( + request={'project_id': project_id, 'region': region, 'job_id': job_id}, + retry=retry, + timeout=timeout, + metadata=metadata, + ) + return job + diff --git a/dags/utils/patched/dataproc_operator.py b/dags/utils/patched/dataproc_operator.py new file mode 100644 index 000000000..89bfec6f9 --- /dev/null +++ b/dags/utils/patched/dataproc_operator.py @@ -0,0 +1,2194 @@ +# +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. 
+#
+"""This module contains Google Dataproc operators."""
+
+"""
+I copy-pasted this from the providers-google/5.0.0 branch:
+https://github.com/apache/airflow/blob/providers-google/5.0.0/airflow/providers/google/cloud/operators/dataproc.py
+
+I originally created these patch files to fix a potential issue with the
+DataprocSubmit(Py)SparkJobOperators: the dataproc_v1beta2 services/APIs were
+deprecated and have already been removed from the google-cloud-dataproc
+package (as of 2.6.0 and 3.0.0), and apache-airflow-providers-google
+(v6.0.0, the latest) has not been updated to reflect this.
+
+That turned out not to be the issue, but I am keeping these patched versions
+around because they make troubleshooting easier (e.g. adding prints), and
+Google will eventually deprecate the dataproc_v1beta2 APIs, which would require
+us to upgrade the Airflow providers and redeploy. By keeping these patched
+versions, we can simply modify utils/dataproc.py to point at them.
+
+This file replaces all instances of dataproc_v1beta2 with dataproc_v1 and
+imports our own patched versions of DataprocHook/DataProcJobBuilder, in which
+the dataproc_v1beta2 references have been replaced as well.
+"""
+
+
+import inspect
+import ntpath
+import os
+import re
+import time
+import uuid
+import warnings
+from datetime import datetime, timedelta
+from typing import Dict, List, Optional, Sequence, Set, Tuple, Union
+
+from google.api_core.exceptions import AlreadyExists, NotFound
+from google.api_core.retry import Retry, exponential_sleep_generator
+from google.cloud.dataproc_v1 import Cluster
+from google.protobuf.duration_pb2 import Duration
+from google.protobuf.field_mask_pb2 import FieldMask
+
+from airflow.exceptions import AirflowException
+from airflow.models import BaseOperator, BaseOperatorLink
+from airflow.models.taskinstance import TaskInstance
+
+# These will be imported from our patched version instead
+# from airflow.providers.google.cloud.hooks.dataproc import DataprocHook, DataProcJobBuilder
+from utils.patched.dataproc_hook import (
+    DataprocHook,
+    DataProcJobBuilder,
+)
+
+from airflow.providers.google.cloud.hooks.gcs import GCSHook
+from airflow.utils import timezone
+
+DATAPROC_BASE_LINK = "https://console.cloud.google.com/dataproc"
+DATAPROC_JOB_LOG_LINK = DATAPROC_BASE_LINK + "/jobs/{job_id}?region={region}&project={project_id}"
+DATAPROC_CLUSTER_LINK = (
+    DATAPROC_BASE_LINK + "/clusters/{cluster_name}/monitoring?region={region}&project={project_id}"
+)
+
+
+class DataprocJobLink(BaseOperatorLink):
+    """Helper class for constructing Dataproc Job link"""
+
+    name = "Dataproc Job"
+
+    def get_link(self, operator, dttm):
+        ti = TaskInstance(task=operator, execution_date=dttm)
+        job_conf = ti.xcom_pull(task_ids=operator.task_id, key="job_conf")
+        return (
+            DATAPROC_JOB_LOG_LINK.format(
+                job_id=job_conf["job_id"],
+                region=job_conf["region"],
+                project_id=job_conf["project_id"],
+            )
+            if job_conf
+            else ""
+        )
+
+
+class DataprocClusterLink(BaseOperatorLink):
+    """Helper class for constructing Dataproc Cluster link"""
+
+    name = "Dataproc Cluster"
+
+    def get_link(self, operator, dttm):
+        ti = TaskInstance(task=operator, execution_date=dttm)
+        cluster_conf = ti.xcom_pull(task_ids=operator.task_id, key="cluster_conf")
+        return (
+            DATAPROC_CLUSTER_LINK.format(
+                cluster_name=cluster_conf["cluster_name"],
+                region=cluster_conf["region"],
+                project_id=cluster_conf["project_id"],
+            )
+            if cluster_conf
+            else ""
+        )
+
+
+class ClusterGenerator:
+    """
+    Create a new Dataproc Cluster.
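+
+    **Example** (an illustrative sketch only; the project, zone, bucket and cluster
+    names below are placeholders, not values used by this repo) showing a generated
+    config being passed to ``DataprocCreateClusterOperator``::
+
+        generator = ClusterGenerator(
+            project_id="my-project",
+            zone="us-west1-b",
+            num_workers=2,
+            worker_machine_type="n1-standard-8",
+            storage_bucket="my-staging-bucket",
+            idle_delete_ttl=3600,
+        )
+        create_cluster = DataprocCreateClusterOperator(
+            task_id="create_cluster",
+            project_id="my-project",
+            region="us-west1",
+            cluster_name="my-cluster",
+            cluster_config=generator.make(),
+        )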
+ + :param cluster_name: The name of the DataProc cluster to create. (templated) + :type cluster_name: str + :param project_id: The ID of the google cloud project in which + to create the cluster. (templated) + :type project_id: str + :param num_workers: The # of workers to spin up. If set to zero will + spin up cluster in a single node mode + :type num_workers: int + :param storage_bucket: The storage bucket to use, setting to None lets dataproc + generate a custom one for you + :type storage_bucket: str + :param init_actions_uris: List of GCS uri's containing + dataproc initialization scripts + :type init_actions_uris: list[str] + :param init_action_timeout: Amount of time executable scripts in + init_actions_uris has to complete + :type init_action_timeout: str + :param metadata: dict of key-value google compute engine metadata entries + to add to all instances + :type metadata: dict + :param image_version: the version of software inside the Dataproc cluster + :type image_version: str + :param custom_image: custom Dataproc image for more info see + https://cloud.google.com/dataproc/docs/guides/dataproc-images + :type custom_image: str + :param custom_image_project_id: project id for the custom Dataproc image, for more info see + https://cloud.google.com/dataproc/docs/guides/dataproc-images + :type custom_image_project_id: str + :param custom_image_family: family for the custom Dataproc image, + family name can be provide using --family flag while creating custom image, for more info see + https://cloud.google.com/dataproc/docs/guides/dataproc-images + :type custom_image_family: str + :param autoscaling_policy: The autoscaling policy used by the cluster. Only resource names + including projectid and location (region) are valid. Example: + ``projects/[projectId]/locations/[dataproc_region]/autoscalingPolicies/[policy_id]`` + :type autoscaling_policy: str + :param properties: dict of properties to set on + config files (e.g. spark-defaults.conf), see + https://cloud.google.com/dataproc/docs/reference/rest/v1/projects.regions.clusters#SoftwareConfig + :type properties: dict + :param optional_components: List of optional cluster components, for more info see + https://cloud.google.com/dataproc/docs/reference/rest/v1/ClusterConfig#Component + :type optional_components: list[str] + :param num_masters: The # of master nodes to spin up + :type num_masters: int + :param master_machine_type: Compute engine machine type to use for the master node + :type master_machine_type: str + :param master_disk_type: Type of the boot disk for the master node + (default is ``pd-standard``). + Valid values: ``pd-ssd`` (Persistent Disk Solid State Drive) or + ``pd-standard`` (Persistent Disk Hard Disk Drive). + :type master_disk_type: str + :param master_disk_size: Disk size for the master node + :type master_disk_size: int + :param worker_machine_type: Compute engine machine type to use for the worker nodes + :type worker_machine_type: str + :param worker_disk_type: Type of the boot disk for the worker node + (default is ``pd-standard``). + Valid values: ``pd-ssd`` (Persistent Disk Solid State Drive) or + ``pd-standard`` (Persistent Disk Hard Disk Drive). 
+ :type worker_disk_type: str + :param worker_disk_size: Disk size for the worker nodes + :type worker_disk_size: int + :param num_preemptible_workers: The # of preemptible worker nodes to spin up + :type num_preemptible_workers: int + :param labels: dict of labels to add to the cluster + :type labels: dict + :param zone: The zone where the cluster will be located. Set to None to auto-zone. (templated) + :type zone: str + :param network_uri: The network uri to be used for machine communication, cannot be + specified with subnetwork_uri + :type network_uri: str + :param subnetwork_uri: The subnetwork uri to be used for machine communication, + cannot be specified with network_uri + :type subnetwork_uri: str + :param internal_ip_only: If true, all instances in the cluster will only + have internal IP addresses. This can only be enabled for subnetwork + enabled networks + :type internal_ip_only: bool + :param tags: The GCE tags to add to all instances + :type tags: list[str] + :param region: The specified region where the dataproc cluster is created. + :type region: str + :param gcp_conn_id: The connection ID to use connecting to Google Cloud. + :type gcp_conn_id: str + :param service_account: The service account of the dataproc instances. + :type service_account: str + :param service_account_scopes: The URIs of service account scopes to be included. + :type service_account_scopes: list[str] + :param idle_delete_ttl: The longest duration that cluster would keep alive while + staying idle. Passing this threshold will cause cluster to be auto-deleted. + A duration in seconds. + :type idle_delete_ttl: int + :param auto_delete_time: The time when cluster will be auto-deleted. + :type auto_delete_time: datetime.datetime + :param auto_delete_ttl: The life duration of cluster, the cluster will be + auto-deleted at the end of this duration. + A duration in seconds. 
(If auto_delete_time is set this parameter will be ignored) + :type auto_delete_ttl: int + :param customer_managed_key: The customer-managed key used for disk encryption + ``projects/[PROJECT_STORING_KEYS]/locations/[LOCATION]/keyRings/[KEY_RING_NAME]/cryptoKeys/[KEY_NAME]`` # noqa + :type customer_managed_key: str + """ + + def __init__( + self, + project_id: str, + num_workers: Optional[int] = None, + zone: Optional[str] = None, + network_uri: Optional[str] = None, + subnetwork_uri: Optional[str] = None, + internal_ip_only: Optional[bool] = None, + tags: Optional[List[str]] = None, + storage_bucket: Optional[str] = None, + init_actions_uris: Optional[List[str]] = None, + init_action_timeout: str = "10m", + metadata: Optional[Dict] = None, + custom_image: Optional[str] = None, + custom_image_project_id: Optional[str] = None, + custom_image_family: Optional[str] = None, + image_version: Optional[str] = None, + autoscaling_policy: Optional[str] = None, + properties: Optional[Dict] = None, + optional_components: Optional[List[str]] = None, + num_masters: int = 1, + master_machine_type: str = 'n1-standard-4', + master_disk_type: str = 'pd-standard', + master_disk_size: int = 1024, + worker_machine_type: str = 'n1-standard-4', + worker_disk_type: str = 'pd-standard', + worker_disk_size: int = 1024, + num_preemptible_workers: int = 0, + service_account: Optional[str] = None, + service_account_scopes: Optional[List[str]] = None, + idle_delete_ttl: Optional[int] = None, + auto_delete_time: Optional[datetime] = None, + auto_delete_ttl: Optional[int] = None, + customer_managed_key: Optional[str] = None, + **kwargs, + ) -> None: + + self.project_id = project_id + self.num_masters = num_masters + self.num_workers = num_workers + self.num_preemptible_workers = num_preemptible_workers + self.storage_bucket = storage_bucket + self.init_actions_uris = init_actions_uris + self.init_action_timeout = init_action_timeout + self.metadata = metadata + self.custom_image = custom_image + self.custom_image_project_id = custom_image_project_id + self.custom_image_family = custom_image_family + self.image_version = image_version + self.properties = properties or {} + self.optional_components = optional_components + self.master_machine_type = master_machine_type + self.master_disk_type = master_disk_type + self.master_disk_size = master_disk_size + self.autoscaling_policy = autoscaling_policy + self.worker_machine_type = worker_machine_type + self.worker_disk_type = worker_disk_type + self.worker_disk_size = worker_disk_size + self.zone = zone + self.network_uri = network_uri + self.subnetwork_uri = subnetwork_uri + self.internal_ip_only = internal_ip_only + self.tags = tags + self.service_account = service_account + self.service_account_scopes = service_account_scopes + self.idle_delete_ttl = idle_delete_ttl + self.auto_delete_time = auto_delete_time + self.auto_delete_ttl = auto_delete_ttl + self.customer_managed_key = customer_managed_key + self.single_node = num_workers == 0 + + if self.custom_image and self.image_version: + raise ValueError("The custom_image and image_version can't be both set") + + if self.custom_image_family and self.image_version: + raise ValueError("The image_version and custom_image_family can't be both set") + + if self.custom_image_family and self.custom_image: + raise ValueError("The custom_image and custom_image_family can't be both set") + + if self.single_node and self.num_preemptible_workers > 0: + raise ValueError("Single node cannot have preemptible workers.") + + def 
_get_init_action_timeout(self) -> dict: + match = re.match(r"^(\d+)([sm])$", self.init_action_timeout) + if match: + val = float(match.group(1)) + if match.group(2) == "s": + return {"seconds": int(val)} + elif match.group(2) == "m": + return {"seconds": int(timedelta(minutes=val).total_seconds())} + + raise AirflowException( + "DataprocClusterCreateOperator init_action_timeout" + " should be expressed in minutes or seconds. i.e. 10m, 30s" + ) + + def _build_gce_cluster_config(self, cluster_data): + if self.zone: + zone_uri = 'https://www.googleapis.com/compute/v1/projects/{}/zones/{}'.format( + self.project_id, self.zone + ) + cluster_data['gce_cluster_config']['zone_uri'] = zone_uri + + if self.metadata: + cluster_data['gce_cluster_config']['metadata'] = self.metadata + + if self.network_uri: + cluster_data['gce_cluster_config']['network_uri'] = self.network_uri + + if self.subnetwork_uri: + cluster_data['gce_cluster_config']['subnetwork_uri'] = self.subnetwork_uri + + if self.internal_ip_only: + if not self.subnetwork_uri: + raise AirflowException("Set internal_ip_only to true only when you pass a subnetwork_uri.") + cluster_data['gce_cluster_config']['internal_ip_only'] = True + + if self.tags: + cluster_data['gce_cluster_config']['tags'] = self.tags + + if self.service_account: + cluster_data['gce_cluster_config']['service_account'] = self.service_account + + if self.service_account_scopes: + cluster_data['gce_cluster_config']['service_account_scopes'] = self.service_account_scopes + + return cluster_data + + def _build_lifecycle_config(self, cluster_data): + if self.idle_delete_ttl: + cluster_data['lifecycle_config']['idle_delete_ttl'] = {"seconds": self.idle_delete_ttl} + + if self.auto_delete_time: + utc_auto_delete_time = timezone.convert_to_utc(self.auto_delete_time) + cluster_data['lifecycle_config']['auto_delete_time'] = utc_auto_delete_time.strftime( + '%Y-%m-%dT%H:%M:%S.%fZ' + ) + elif self.auto_delete_ttl: + cluster_data['lifecycle_config']['auto_delete_ttl'] = {"seconds": int(self.auto_delete_ttl)} + + return cluster_data + + def _build_cluster_data(self): + if self.zone: + master_type_uri = ( + f"projects/{self.project_id}/zones/{self.zone}/machineTypes/{self.master_machine_type}" + ) + worker_type_uri = ( + f"projects/{self.project_id}/zones/{self.zone}/machineTypes/{self.worker_machine_type}" + ) + else: + master_type_uri = self.master_machine_type + worker_type_uri = self.worker_machine_type + + cluster_data = { + 'gce_cluster_config': {}, + 'master_config': { + 'num_instances': self.num_masters, + 'machine_type_uri': master_type_uri, + 'disk_config': { + 'boot_disk_type': self.master_disk_type, + 'boot_disk_size_gb': self.master_disk_size, + }, + }, + 'worker_config': { + 'num_instances': self.num_workers, + 'machine_type_uri': worker_type_uri, + 'disk_config': { + 'boot_disk_type': self.worker_disk_type, + 'boot_disk_size_gb': self.worker_disk_size, + }, + }, + 'secondary_worker_config': {}, + 'software_config': {}, + 'lifecycle_config': {}, + 'encryption_config': {}, + 'autoscaling_config': {}, + } + if self.num_preemptible_workers > 0: + cluster_data['secondary_worker_config'] = { + 'num_instances': self.num_preemptible_workers, + 'machine_type_uri': worker_type_uri, + 'disk_config': { + 'boot_disk_type': self.worker_disk_type, + 'boot_disk_size_gb': self.worker_disk_size, + }, + 'is_preemptible': True, + } + + if self.storage_bucket: + cluster_data['config_bucket'] = self.storage_bucket + + if self.image_version: + cluster_data['software_config']['image_version'] = 
self.image_version + + elif self.custom_image: + project_id = self.custom_image_project_id or self.project_id + custom_image_url = ( + 'https://www.googleapis.com/compute/beta/projects/' + '{}/global/images/{}'.format(project_id, self.custom_image) + ) + cluster_data['master_config']['image_uri'] = custom_image_url + if not self.single_node: + cluster_data['worker_config']['image_uri'] = custom_image_url + + elif self.custom_image_family: + project_id = self.custom_image_project_id or self.project_id + custom_image_url = ( + 'https://www.googleapis.com/compute/beta/projects/' + f'{project_id}/global/images/family/{self.custom_image_family}' + ) + cluster_data['master_config']['image_uri'] = custom_image_url + if not self.single_node: + cluster_data['worker_config']['image_uri'] = custom_image_url + + cluster_data = self._build_gce_cluster_config(cluster_data) + + if self.single_node: + self.properties["dataproc:dataproc.allow.zero.workers"] = "true" + + if self.properties: + cluster_data['software_config']['properties'] = self.properties + + if self.optional_components: + cluster_data['software_config']['optional_components'] = self.optional_components + + cluster_data = self._build_lifecycle_config(cluster_data) + + if self.init_actions_uris: + init_actions_dict = [ + {'executable_file': uri, 'execution_timeout': self._get_init_action_timeout()} + for uri in self.init_actions_uris + ] + cluster_data['initialization_actions'] = init_actions_dict + + if self.customer_managed_key: + cluster_data['encryption_config'] = {'gce_pd_kms_key_name': self.customer_managed_key} + if self.autoscaling_policy: + cluster_data['autoscaling_config'] = {'policy_uri': self.autoscaling_policy} + + return cluster_data + + def make(self): + """ + Helper method for easier migration. + :return: Dict representing Dataproc cluster. + """ + return self._build_cluster_data() + + +class DataprocCreateClusterOperator(BaseOperator): + """ + Create a new cluster on Google Cloud Dataproc. The operator will wait until the + creation is successful or an error occurs in the creation process. If the cluster + already exists and ``use_if_exists`` is True then the operator will: + + - if cluster state is ERROR then delete it if specified and raise error + - if cluster state is CREATING wait for it and then check for ERROR state + - if cluster state is DELETING wait for it and then create new cluster + + Please refer to + + https://cloud.google.com/dataproc/docs/reference/rest/v1/projects.regions.clusters + + for a detailed explanation on the different parameters. Most of the configuration + parameters detailed in the link are available as a parameter to this operator. + + .. seealso:: + For more information on how to use this operator, take a look at the guide: + :ref:`howto/operator:DataprocCreateClusterOperator` + + :param project_id: The ID of the google cloud project in which + to create the cluster. (templated) + :type project_id: str + :param cluster_name: Name of the cluster to create + :type cluster_name: str + :param labels: Labels that will be assigned to created cluster + :type labels: Dict[str, str] + :param cluster_config: Required. The cluster config to create. + If a dict is provided, it must be of the same form as the protobuf message + :class:`~google.cloud.dataproc_v1.types.ClusterConfig` + :type cluster_config: Union[Dict, google.cloud.dataproc_v1.types.ClusterConfig] + :param region: The specified region where the dataproc cluster is created. 
+ :type region: str + :param delete_on_error: If true the cluster will be deleted if created with ERROR state. Default + value is true. + :type delete_on_error: bool + :param use_if_exists: If true use existing cluster + :type use_if_exists: bool + :param request_id: Optional. A unique id used to identify the request. If the server receives two + ``DeleteClusterRequest`` requests with the same id, then the second request will be ignored and the + first ``google.longrunning.Operation`` created and stored in the backend is returned. + :type request_id: str + :param retry: A retry object used to retry requests. If ``None`` is specified, requests will not be + retried. + :type retry: google.api_core.retry.Retry + :param timeout: The amount of time, in seconds, to wait for the request to complete. Note that if + ``retry`` is specified, the timeout applies to each individual attempt. + :type timeout: float + :param metadata: Additional metadata that is provided to the method. + :type metadata: Sequence[Tuple[str, str]] + :param gcp_conn_id: The connection ID to use connecting to Google Cloud. + :type gcp_conn_id: str + :param impersonation_chain: Optional service account to impersonate using short-term + credentials, or chained list of accounts required to get the access_token + of the last account in the list, which will be impersonated in the request. + If set as a string, the account must grant the originating account + the Service Account Token Creator IAM role. + If set as a sequence, the identities from the list must grant + Service Account Token Creator IAM role to the directly preceding identity, with first + account from the list granting this role to the originating account (templated). + :type impersonation_chain: Union[str, Sequence[str]] + """ + + template_fields = ( + 'project_id', + 'region', + 'cluster_config', + 'cluster_name', + 'labels', + 'impersonation_chain', + ) + template_fields_renderers = {'cluster_config': 'json'} + + operator_extra_links = (DataprocClusterLink(),) + + def __init__( + self, + *, + cluster_name: str, + region: Optional[str] = None, + project_id: Optional[str] = None, + cluster_config: Optional[Dict] = None, + labels: Optional[Dict] = None, + request_id: Optional[str] = None, + delete_on_error: bool = True, + use_if_exists: bool = True, + retry: Optional[Retry] = None, + timeout: float = 1 * 60 * 60, + metadata: Optional[Sequence[Tuple[str, str]]] = None, + gcp_conn_id: str = "google_cloud_default", + impersonation_chain: Optional[Union[str, Sequence[str]]] = None, + **kwargs, + ) -> None: + if region is None: + warnings.warn( + "Default region value `global` will be deprecated. Please, provide region value.", + DeprecationWarning, + stacklevel=2, + ) + region = 'global' + + # TODO: remove one day + if cluster_config is None: + warnings.warn( + "Passing cluster parameters by keywords to `{}` " + "will be deprecated. Please provide cluster_config object using `cluster_config` parameter. 
" + "You can use `airflow.dataproc.ClusterGenerator.generate_cluster` method to " + "obtain cluster object.".format(type(self).__name__), + DeprecationWarning, + stacklevel=1, + ) + # Remove result of apply defaults + if 'params' in kwargs: + del kwargs['params'] + + # Create cluster object from kwargs + if project_id is None: + raise AirflowException( + "project_id argument is required when building cluster from keywords parameters" + ) + kwargs["project_id"] = project_id + cluster_config = ClusterGenerator(**kwargs).make() + + # Remove from kwargs cluster params passed for backward compatibility + cluster_params = inspect.signature(ClusterGenerator.__init__).parameters + for arg in cluster_params: + if arg in kwargs: + del kwargs[arg] + + super().__init__(**kwargs) + + self.cluster_config = cluster_config + self.cluster_name = cluster_name + self.labels = labels + self.project_id = project_id + self.region = region + self.request_id = request_id + self.retry = retry + self.timeout = timeout + self.metadata = metadata + self.gcp_conn_id = gcp_conn_id + self.delete_on_error = delete_on_error + self.use_if_exists = use_if_exists + self.impersonation_chain = impersonation_chain + + def _create_cluster(self, hook: DataprocHook): + operation = hook.create_cluster( + project_id=self.project_id, + region=self.region, + cluster_name=self.cluster_name, + labels=self.labels, + cluster_config=self.cluster_config, + request_id=self.request_id, + retry=self.retry, + timeout=self.timeout, + metadata=self.metadata, + ) + cluster = operation.result() + self.log.info("Cluster created.") + return cluster + + def _delete_cluster(self, hook): + self.log.info("Deleting the cluster") + hook.delete_cluster(region=self.region, cluster_name=self.cluster_name, project_id=self.project_id) + + def _get_cluster(self, hook: DataprocHook) -> Cluster: + return hook.get_cluster( + project_id=self.project_id, + region=self.region, + cluster_name=self.cluster_name, + retry=self.retry, + timeout=self.timeout, + metadata=self.metadata, + ) + + def _handle_error_state(self, hook: DataprocHook, cluster: Cluster) -> None: + if cluster.status.state != cluster.status.State.ERROR: + return + self.log.info("Cluster is in ERROR state") + gcs_uri = hook.diagnose_cluster( + region=self.region, cluster_name=self.cluster_name, project_id=self.project_id + ) + self.log.info('Diagnostic information for cluster %s available at: %s', self.cluster_name, gcs_uri) + if self.delete_on_error: + self._delete_cluster(hook) + raise AirflowException("Cluster was created but was in ERROR state.") + raise AirflowException("Cluster was created but is in ERROR state") + + def _wait_for_cluster_in_deleting_state(self, hook: DataprocHook) -> None: + time_left = self.timeout + for time_to_sleep in exponential_sleep_generator(initial=10, maximum=120): + if time_left < 0: + raise AirflowException(f"Cluster {self.cluster_name} is still DELETING state, aborting") + time.sleep(time_to_sleep) + time_left = time_left - time_to_sleep + try: + self._get_cluster(hook) + except NotFound: + break + + def _wait_for_cluster_in_creating_state(self, hook: DataprocHook) -> Cluster: + time_left = self.timeout + cluster = self._get_cluster(hook) + for time_to_sleep in exponential_sleep_generator(initial=10, maximum=120): + if cluster.status.state != cluster.status.State.CREATING: + break + if time_left < 0: + raise AirflowException(f"Cluster {self.cluster_name} is still CREATING state, aborting") + time.sleep(time_to_sleep) + time_left = time_left - time_to_sleep + cluster = 
self._get_cluster(hook) + return cluster + + def execute(self, context) -> dict: + self.log.info('Creating cluster: %s', self.cluster_name) + hook = DataprocHook(gcp_conn_id=self.gcp_conn_id, impersonation_chain=self.impersonation_chain) + # Save data required to display extra link no matter what the cluster status will be + self.xcom_push( + context, + key="cluster_conf", + value={ + "cluster_name": self.cluster_name, + "region": self.region, + "project_id": self.project_id, + }, + ) + try: + # First try to create a new cluster + cluster = self._create_cluster(hook) + except AlreadyExists: + if not self.use_if_exists: + raise + self.log.info("Cluster already exists.") + cluster = self._get_cluster(hook) + + # Check if cluster is not in ERROR state + self._handle_error_state(hook, cluster) + if cluster.status.state == cluster.status.State.CREATING: + # Wait for cluster to be created + cluster = self._wait_for_cluster_in_creating_state(hook) + self._handle_error_state(hook, cluster) + elif cluster.status.state == cluster.status.State.DELETING: + # Wait for cluster to be deleted + self._wait_for_cluster_in_deleting_state(hook) + # Create new cluster + cluster = self._create_cluster(hook) + self._handle_error_state(hook, cluster) + + return Cluster.to_dict(cluster) + + +class DataprocScaleClusterOperator(BaseOperator): + """ + Scale, up or down, a cluster on Google Cloud Dataproc. + The operator will wait until the cluster is re-scaled. + + **Example**: :: + + t1 = DataprocClusterScaleOperator( + task_id='dataproc_scale', + project_id='my-project', + cluster_name='cluster-1', + num_workers=10, + num_preemptible_workers=10, + graceful_decommission_timeout='1h', + dag=dag) + + .. seealso:: + For more detail on about scaling clusters have a look at the reference: + https://cloud.google.com/dataproc/docs/concepts/configuring-clusters/scaling-clusters + + :param cluster_name: The name of the cluster to scale. (templated) + :type cluster_name: str + :param project_id: The ID of the google cloud project in which + the cluster runs. (templated) + :type project_id: str + :param region: The region for the dataproc cluster. (templated) + :type region: str + :param num_workers: The new number of workers + :type num_workers: int + :param num_preemptible_workers: The new number of preemptible workers + :type num_preemptible_workers: int + :param graceful_decommission_timeout: Timeout for graceful YARN decommissioning. + Maximum value is 1d + :type graceful_decommission_timeout: str + :param gcp_conn_id: The connection ID to use connecting to Google Cloud. + :type gcp_conn_id: str + :param impersonation_chain: Optional service account to impersonate using short-term + credentials, or chained list of accounts required to get the access_token + of the last account in the list, which will be impersonated in the request. + If set as a string, the account must grant the originating account + the Service Account Token Creator IAM role. + If set as a sequence, the identities from the list must grant + Service Account Token Creator IAM role to the directly preceding identity, with first + account from the list granting this role to the originating account (templated). 
+ :type impersonation_chain: Union[str, Sequence[str]] + """ + + template_fields = ['cluster_name', 'project_id', 'region', 'impersonation_chain'] + + operator_extra_links = (DataprocClusterLink(),) + + def __init__( + self, + *, + cluster_name: str, + project_id: Optional[str] = None, + region: str = 'global', + num_workers: int = 2, + num_preemptible_workers: int = 0, + graceful_decommission_timeout: Optional[str] = None, + gcp_conn_id: str = "google_cloud_default", + impersonation_chain: Optional[Union[str, Sequence[str]]] = None, + **kwargs, + ) -> None: + super().__init__(**kwargs) + self.project_id = project_id + self.region = region + self.cluster_name = cluster_name + self.num_workers = num_workers + self.num_preemptible_workers = num_preemptible_workers + self.graceful_decommission_timeout = graceful_decommission_timeout + self.gcp_conn_id = gcp_conn_id + self.impersonation_chain = impersonation_chain + + # TODO: Remove one day + warnings.warn( + "The `{cls}` operator is deprecated, please use `DataprocUpdateClusterOperator` instead.".format( + cls=type(self).__name__ + ), + DeprecationWarning, + stacklevel=1, + ) + + def _build_scale_cluster_data(self) -> dict: + scale_data = { + 'config': { + 'worker_config': {'num_instances': self.num_workers}, + 'secondary_worker_config': {'num_instances': self.num_preemptible_workers}, + } + } + return scale_data + + @property + def _graceful_decommission_timeout_object(self) -> Optional[Dict[str, int]]: + if not self.graceful_decommission_timeout: + return None + + timeout = None + match = re.match(r"^(\d+)([smdh])$", self.graceful_decommission_timeout) + if match: + if match.group(2) == "s": + timeout = int(match.group(1)) + elif match.group(2) == "m": + val = float(match.group(1)) + timeout = int(timedelta(minutes=val).total_seconds()) + elif match.group(2) == "h": + val = float(match.group(1)) + timeout = int(timedelta(hours=val).total_seconds()) + elif match.group(2) == "d": + val = float(match.group(1)) + timeout = int(timedelta(days=val).total_seconds()) + + if not timeout: + raise AirflowException( + "DataprocClusterScaleOperator " + " should be expressed in day, hours, minutes or seconds. " + " i.e. 1d, 4h, 10m, 30s" + ) + + return {'seconds': timeout} + + def execute(self, context) -> None: + """Scale, up or down, a cluster on Google Cloud Dataproc.""" + self.log.info("Scaling cluster: %s", self.cluster_name) + + scaling_cluster_data = self._build_scale_cluster_data() + update_mask = ["config.worker_config.num_instances", "config.secondary_worker_config.num_instances"] + + hook = DataprocHook(gcp_conn_id=self.gcp_conn_id, impersonation_chain=self.impersonation_chain) + # Save data required to display extra link no matter what the cluster status will be + self.xcom_push( + context, + key="cluster_conf", + value={ + "cluster_name": self.cluster_name, + "region": self.region, + "project_id": self.project_id, + }, + ) + operation = hook.update_cluster( + project_id=self.project_id, + region=self.region, + cluster_name=self.cluster_name, + cluster=scaling_cluster_data, + graceful_decommission_timeout=self._graceful_decommission_timeout_object, + update_mask={'paths': update_mask}, + ) + operation.result() + self.log.info("Cluster scaling finished") + + +class DataprocDeleteClusterOperator(BaseOperator): + """ + Deletes a cluster in a project. + + :param project_id: Required. The ID of the Google Cloud project that the cluster belongs to (templated). + :type project_id: str + :param region: Required. 
The Cloud Dataproc region in which to handle the request (templated). + :type region: str + :param cluster_name: Required. The cluster name (templated). + :type cluster_name: str + :param cluster_uuid: Optional. Specifying the ``cluster_uuid`` means the RPC should fail + if cluster with specified UUID does not exist. + :type cluster_uuid: str + :param request_id: Optional. A unique id used to identify the request. If the server receives two + ``DeleteClusterRequest`` requests with the same id, then the second request will be ignored and the + first ``google.longrunning.Operation`` created and stored in the backend is returned. + :type request_id: str + :param retry: A retry object used to retry requests. If ``None`` is specified, requests will not be + retried. + :type retry: google.api_core.retry.Retry + :param timeout: The amount of time, in seconds, to wait for the request to complete. Note that if + ``retry`` is specified, the timeout applies to each individual attempt. + :type timeout: float + :param metadata: Additional metadata that is provided to the method. + :type metadata: Sequence[Tuple[str, str]] + :param gcp_conn_id: The connection ID to use connecting to Google Cloud. + :type gcp_conn_id: str + :param impersonation_chain: Optional service account to impersonate using short-term + credentials, or chained list of accounts required to get the access_token + of the last account in the list, which will be impersonated in the request. + If set as a string, the account must grant the originating account + the Service Account Token Creator IAM role. + If set as a sequence, the identities from the list must grant + Service Account Token Creator IAM role to the directly preceding identity, with first + account from the list granting this role to the originating account (templated). + :type impersonation_chain: Union[str, Sequence[str]] + """ + + template_fields = ('project_id', 'region', 'cluster_name', 'impersonation_chain') + + def __init__( + self, + *, + project_id: str, + region: str, + cluster_name: str, + cluster_uuid: Optional[str] = None, + request_id: Optional[str] = None, + retry: Optional[Retry] = None, + timeout: Optional[float] = None, + metadata: Optional[Sequence[Tuple[str, str]]] = None, + gcp_conn_id: str = "google_cloud_default", + impersonation_chain: Optional[Union[str, Sequence[str]]] = None, + **kwargs, + ): + super().__init__(**kwargs) + self.project_id = project_id + self.region = region + self.cluster_name = cluster_name + self.cluster_uuid = cluster_uuid + self.request_id = request_id + self.retry = retry + self.timeout = timeout + self.metadata = metadata + self.gcp_conn_id = gcp_conn_id + self.impersonation_chain = impersonation_chain + + def execute(self, context: dict) -> None: + hook = DataprocHook(gcp_conn_id=self.gcp_conn_id, impersonation_chain=self.impersonation_chain) + self.log.info("Deleting cluster: %s", self.cluster_name) + operation = hook.delete_cluster( + project_id=self.project_id, + region=self.region, + cluster_name=self.cluster_name, + cluster_uuid=self.cluster_uuid, + request_id=self.request_id, + retry=self.retry, + timeout=self.timeout, + metadata=self.metadata, + ) + operation.result() + self.log.info("Cluster deleted.") + + +class DataprocJobBaseOperator(BaseOperator): + """ + The base class for operators that launch job on DataProc. + + :param job_name: The job name used in the DataProc cluster. This name by default + is the task_id appended with the execution data, but can be templated. 
The + name will always be appended with a random number to avoid name clashes. + :type job_name: str + :param cluster_name: The name of the DataProc cluster. + :type cluster_name: str + :param project_id: The ID of the Google Cloud project the cluster belongs to, + if not specified the project will be inferred from the provided GCP connection. + :type project_id: str + :param dataproc_properties: Map for the Hive properties. Ideal to put in + default arguments (templated) + :type dataproc_properties: dict + :param dataproc_jars: HCFS URIs of jar files to add to the CLASSPATH of the Hive server and Hadoop + MapReduce (MR) tasks. Can contain Hive SerDes and UDFs. (templated) + :type dataproc_jars: list + :param gcp_conn_id: The connection ID to use connecting to Google Cloud. + :type gcp_conn_id: str + :param delegate_to: The account to impersonate using domain-wide delegation of authority, + if any. For this to work, the service account making the request must have + domain-wide delegation enabled. + :type delegate_to: str + :param labels: The labels to associate with this job. Label keys must contain 1 to 63 characters, + and must conform to RFC 1035. Label values may be empty, but, if present, must contain 1 to 63 + characters, and must conform to RFC 1035. No more than 32 labels can be associated with a job. + :type labels: dict + :param region: The specified region where the dataproc cluster is created. + :type region: str + :param job_error_states: Job states that should be considered error states. + Any states in this set will result in an error being raised and failure of the + task. Eg, if the ``CANCELLED`` state should also be considered a task failure, + pass in ``{'ERROR', 'CANCELLED'}``. Possible values are currently only + ``'ERROR'`` and ``'CANCELLED'``, but could change in the future. Defaults to + ``{'ERROR'}``. + :type job_error_states: set + :param impersonation_chain: Optional service account to impersonate using short-term + credentials, or chained list of accounts required to get the access_token + of the last account in the list, which will be impersonated in the request. + If set as a string, the account must grant the originating account + the Service Account Token Creator IAM role. + If set as a sequence, the identities from the list must grant + Service Account Token Creator IAM role to the directly preceding identity, with first + account from the list granting this role to the originating account (templated). + :type impersonation_chain: Union[str, Sequence[str]] + :param asynchronous: Flag to return after submitting the job to the Dataproc API. + This is useful for submitting long running jobs and + waiting on them asynchronously using the DataprocJobSensor + :type asynchronous: bool + + :var dataproc_job_id: The actual "jobId" as submitted to the Dataproc API. + This is useful for identifying or linking to the job in the Google Cloud Console + Dataproc UI, as the actual "jobId" submitted to the Dataproc API is appended with + an 8 character random string. 
+ :vartype dataproc_job_id: str + """ + + job_type = "" + + operator_extra_links = (DataprocJobLink(),) + + def __init__( + self, + *, + job_name: str = '{{task.task_id}}_{{ds_nodash}}', + cluster_name: str = "cluster-1", + project_id: Optional[str] = None, + dataproc_properties: Optional[Dict] = None, + dataproc_jars: Optional[List[str]] = None, + gcp_conn_id: str = 'google_cloud_default', + delegate_to: Optional[str] = None, + labels: Optional[Dict] = None, + region: Optional[str] = None, + job_error_states: Optional[Set[str]] = None, + impersonation_chain: Optional[Union[str, Sequence[str]]] = None, + asynchronous: bool = False, + **kwargs, + ) -> None: + super().__init__(**kwargs) + self.gcp_conn_id = gcp_conn_id + self.delegate_to = delegate_to + self.labels = labels + self.job_name = job_name + self.cluster_name = cluster_name + self.dataproc_properties = dataproc_properties + self.dataproc_jars = dataproc_jars + + if region is None: + warnings.warn( + "Default region value `global` will be deprecated. Please, provide region value.", + DeprecationWarning, + stacklevel=2, + ) + region = 'global' + self.region = region + + self.job_error_states = job_error_states if job_error_states is not None else {'ERROR'} + self.impersonation_chain = impersonation_chain + self.hook = DataprocHook(gcp_conn_id=gcp_conn_id, impersonation_chain=impersonation_chain) + self.project_id = self.hook.project_id if project_id is None else project_id + self.job_template = None + self.job = None + self.dataproc_job_id = None + self.asynchronous = asynchronous + + def create_job_template(self): + """Initialize `self.job_template` with default values""" + self.job_template = DataProcJobBuilder( + project_id=self.project_id, + task_id=self.task_id, + cluster_name=self.cluster_name, + job_type=self.job_type, + properties=self.dataproc_properties, + ) + self.job_template.set_job_name(self.job_name) + self.job_template.add_jar_file_uris(self.dataproc_jars) + self.job_template.add_labels(self.labels) + + def _generate_job_template(self) -> str: + if self.job_template: + job = self.job_template.build() + return job['job'] + raise Exception("Create a job template before") + + def execute(self, context): + if self.job_template: + self.job = self.job_template.build() + self.dataproc_job_id = self.job["job"]["reference"]["job_id"] + self.log.info('Submitting %s job %s', self.job_type, self.dataproc_job_id) + job_object = self.hook.submit_job( + project_id=self.project_id, job=self.job["job"], region=self.region + ) + job_id = job_object.reference.job_id + self.log.info('Job %s submitted successfully.', job_id) + # Save data required for extra links no matter what the job status will be + self.xcom_push( + context, + key='job_conf', + value={'job_id': job_id, 'region': self.region, 'project_id': self.project_id}, + ) + + if not self.asynchronous: + self.log.info('Waiting for job %s to complete', job_id) + self.hook.wait_for_job(job_id=job_id, region=self.region, project_id=self.project_id) + self.log.info('Job %s completed successfully.', job_id) + return job_id + else: + raise AirflowException("Create a job template before") + + def on_kill(self) -> None: + """ + Callback called when the operator is killed. + Cancel any running job. + """ + if self.dataproc_job_id: + self.hook.cancel_job(project_id=self.project_id, job_id=self.dataproc_job_id, region=self.region) + + +class DataprocSubmitPigJobOperator(DataprocJobBaseOperator): + """ + Start a Pig query Job on a Cloud DataProc cluster. 
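The per-job operators that follow are all deprecated in favor of `DataprocSubmitJobOperator`, and their warnings point at the `generate_job()` helper as a migration aid. A hedged sketch of that path is shown here, using the Pig operator as the legacy example; the project, region, bucket, and cluster values are placeholders, and the imports assume the upstream provider package.

```python
from airflow.providers.google.cloud.operators.dataproc import (
    DataprocSubmitJobOperator,
    DataprocSubmitPigJobOperator,
)

# The legacy operator is only used here to build the job dict; it is not added to a DAG.
legacy_pig = DataprocSubmitPigJobOperator(
    task_id="legacy_pig",
    query_uri="gs://my-bucket/scripts/a_pig_script.pig",  # placeholder
    variables={"out": "gs://my-bucket/output/"},          # placeholder
    cluster_name="cluster-1",                             # placeholder
    project_id="my-project",                              # placeholder
    region="us-west1",                                    # placeholder
)

# generate_job() returns the same dict-shaped Dataproc job the legacy operator would submit.
pig_job = legacy_pig.generate_job()

submit_pig = DataprocSubmitJobOperator(
    task_id="submit_pig",
    project_id="my-project",
    region="us-west1",
    job=pig_job,
)
```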
The parameters of the operation + will be passed to the cluster. + + It's a good practice to define dataproc_* parameters in the default_args of the dag + like the cluster name and UDFs. + + .. code-block:: python + + default_args = { + "cluster_name": "cluster-1", + "dataproc_pig_jars": [ + "gs://example/udf/jar/datafu/1.2.0/datafu.jar", + "gs://example/udf/jar/gpig/1.2/gpig.jar", + ], + } + + You can pass a pig script as string or file reference. Use variables to pass on + variables for the pig script to be resolved on the cluster or use the parameters to + be resolved in the script as template parameters. + + **Example**: :: + + t1 = DataProcPigOperator( + task_id='dataproc_pig', + query='a_pig_script.pig', + variables={'out': 'gs://example/output/{{ds}}'}, + dag=dag) + + .. seealso:: + For more detail on about job submission have a look at the reference: + https://cloud.google.com/dataproc/reference/rest/v1/projects.regions.jobs + + :param query: The query or reference to the query + file (pg or pig extension). (templated) + :type query: str + :param query_uri: The HCFS URI of the script that contains the Pig queries. + :type query_uri: str + :param variables: Map of named parameters for the query. (templated) + :type variables: dict + """ + + template_fields = [ + 'query', + 'variables', + 'job_name', + 'cluster_name', + 'region', + 'dataproc_jars', + 'dataproc_properties', + 'impersonation_chain', + ] + template_ext = ('.pg', '.pig') + ui_color = '#0273d4' + job_type = 'pig_job' + + operator_extra_links = (DataprocJobLink(),) + + def __init__( + self, + *, + query: Optional[str] = None, + query_uri: Optional[str] = None, + variables: Optional[Dict] = None, + **kwargs, + ) -> None: + # TODO: Remove one day + warnings.warn( + "The `{cls}` operator is deprecated, please use `DataprocSubmitJobOperator` instead. You can use" + " `generate_job` method of `{cls}` to generate dictionary representing your job" + " and use it with the new operator.".format(cls=type(self).__name__), + DeprecationWarning, + stacklevel=1, + ) + + super().__init__(**kwargs) + self.query = query + self.query_uri = query_uri + self.variables = variables + + def generate_job(self): + """ + Helper method for easier migration to `DataprocSubmitJobOperator`. + :return: Dict representing Dataproc job + """ + self.create_job_template() + + if self.query is None: + self.job_template.add_query_uri(self.query_uri) + else: + self.job_template.add_query(self.query) + self.job_template.add_variables(self.variables) + return self._generate_job_template() + + def execute(self, context): + self.create_job_template() + + if self.query is None: + self.job_template.add_query_uri(self.query_uri) + else: + self.job_template.add_query(self.query) + self.job_template.add_variables(self.variables) + + super().execute(context) + + +class DataprocSubmitHiveJobOperator(DataprocJobBaseOperator): + """ + Start a Hive query Job on a Cloud DataProc cluster. + + :param query: The query or reference to the query file (q extension). + :type query: str + :param query_uri: The HCFS URI of the script that contains the Hive queries. + :type query_uri: str + :param variables: Map of named parameters for the query. 
+ :type variables: dict + """ + + template_fields = [ + 'query', + 'variables', + 'job_name', + 'cluster_name', + 'region', + 'dataproc_jars', + 'dataproc_properties', + 'impersonation_chain', + ] + template_ext = ('.q', '.hql') + ui_color = '#0273d4' + job_type = 'hive_job' + + def __init__( + self, + *, + query: Optional[str] = None, + query_uri: Optional[str] = None, + variables: Optional[Dict] = None, + **kwargs, + ) -> None: + # TODO: Remove one day + warnings.warn( + "The `{cls}` operator is deprecated, please use `DataprocSubmitJobOperator` instead. You can use" + " `generate_job` method of `{cls}` to generate dictionary representing your job" + " and use it with the new operator.".format(cls=type(self).__name__), + DeprecationWarning, + stacklevel=1, + ) + + super().__init__(**kwargs) + self.query = query + self.query_uri = query_uri + self.variables = variables + if self.query is not None and self.query_uri is not None: + raise AirflowException('Only one of `query` and `query_uri` can be passed.') + + def generate_job(self): + """ + Helper method for easier migration to `DataprocSubmitJobOperator`. + :return: Dict representing Dataproc job + """ + self.create_job_template() + if self.query is None: + self.job_template.add_query_uri(self.query_uri) + else: + self.job_template.add_query(self.query) + self.job_template.add_variables(self.variables) + return self._generate_job_template() + + def execute(self, context): + self.create_job_template() + if self.query is None: + self.job_template.add_query_uri(self.query_uri) + else: + self.job_template.add_query(self.query) + self.job_template.add_variables(self.variables) + + super().execute(context) + + +class DataprocSubmitSparkSqlJobOperator(DataprocJobBaseOperator): + """ + Start a Spark SQL query Job on a Cloud DataProc cluster. + + :param query: The query or reference to the query file (q extension). (templated) + :type query: str + :param query_uri: The HCFS URI of the script that contains the SQL queries. + :type query_uri: str + :param variables: Map of named parameters for the query. (templated) + :type variables: dict + """ + + template_fields = [ + 'query', + 'variables', + 'job_name', + 'cluster_name', + 'region', + 'dataproc_jars', + 'dataproc_properties', + 'impersonation_chain', + ] + template_ext = ('.q',) + ui_color = '#0273d4' + job_type = 'spark_sql_job' + + def __init__( + self, + *, + query: Optional[str] = None, + query_uri: Optional[str] = None, + variables: Optional[Dict] = None, + **kwargs, + ) -> None: + # TODO: Remove one day + warnings.warn( + "The `{cls}` operator is deprecated, please use `DataprocSubmitJobOperator` instead. You can use" + " `generate_job` method of `{cls}` to generate dictionary representing your job" + " and use it with the new operator.".format(cls=type(self).__name__), + DeprecationWarning, + stacklevel=1, + ) + + super().__init__(**kwargs) + self.query = query + self.query_uri = query_uri + self.variables = variables + if self.query is not None and self.query_uri is not None: + raise AirflowException('Only one of `query` and `query_uri` can be passed.') + + def generate_job(self): + """ + Helper method for easier migration to `DataprocSubmitJobOperator`. 
+ :return: Dict representing Dataproc job + """ + self.create_job_template() + if self.query is None: + self.job_template.add_query_uri(self.query_uri) + else: + self.job_template.add_query(self.query) + self.job_template.add_variables(self.variables) + return self._generate_job_template() + + def execute(self, context): + self.create_job_template() + if self.query is None: + self.job_template.add_query_uri(self.query_uri) + else: + self.job_template.add_query(self.query) + self.job_template.add_variables(self.variables) + + super().execute(context) + + +class DataprocSubmitSparkJobOperator(DataprocJobBaseOperator): + """ + Start a Spark Job on a Cloud DataProc cluster. + + :param main_jar: The HCFS URI of the jar file that contains the main class + (use this or the main_class, not both together). + :type main_jar: str + :param main_class: Name of the job class. (use this or the main_jar, not both + together). + :type main_class: str + :param arguments: Arguments for the job. (templated) + :type arguments: list + :param archives: List of archived files that will be unpacked in the work + directory. Should be stored in Cloud Storage. + :type archives: list + :param files: List of files to be copied to the working directory + :type files: list + """ + + template_fields = [ + 'arguments', + 'job_name', + 'cluster_name', + 'region', + 'dataproc_jars', + 'dataproc_properties', + 'impersonation_chain', + ] + ui_color = '#0273d4' + job_type = 'spark_job' + + def __init__( + self, + *, + main_jar: Optional[str] = None, + main_class: Optional[str] = None, + arguments: Optional[List] = None, + archives: Optional[List] = None, + files: Optional[List] = None, + **kwargs, + ) -> None: + # TODO: Remove one day + warnings.warn( + "The `{cls}` operator is deprecated, please use `DataprocSubmitJobOperator` instead. You can use" + " `generate_job` method of `{cls}` to generate dictionary representing your job" + " and use it with the new operator.".format(cls=type(self).__name__), + DeprecationWarning, + stacklevel=1, + ) + + super().__init__(**kwargs) + self.main_jar = main_jar + self.main_class = main_class + self.arguments = arguments + self.archives = archives + self.files = files + + def generate_job(self): + """ + Helper method for easier migration to `DataprocSubmitJobOperator`. + :return: Dict representing Dataproc job + """ + self.create_job_template() + self.job_template.set_main(self.main_jar, self.main_class) + self.job_template.add_args(self.arguments) + self.job_template.add_archive_uris(self.archives) + self.job_template.add_file_uris(self.files) + return self._generate_job_template() + + def execute(self, context): + self.create_job_template() + self.job_template.set_main(self.main_jar, self.main_class) + self.job_template.add_args(self.arguments) + self.job_template.add_archive_uris(self.archives) + self.job_template.add_file_uris(self.files) + + super().execute(context) + + +class DataprocSubmitHadoopJobOperator(DataprocJobBaseOperator): + """ + Start a Hadoop Job on a Cloud DataProc cluster. + + :param main_jar: The HCFS URI of the jar file containing the main class + (use this or the main_class, not both together). + :type main_jar: str + :param main_class: Name of the job class. (use this or the main_jar, not both + together). + :type main_class: str + :param arguments: Arguments for the job. (templated) + :type arguments: list + :param archives: List of archived files that will be unpacked in the work + directory. Should be stored in Cloud Storage. 
+ :type archives: list + :param files: List of files to be copied to the working directory + :type files: list + """ + + template_fields = [ + 'arguments', + 'job_name', + 'cluster_name', + 'region', + 'dataproc_jars', + 'dataproc_properties', + 'impersonation_chain', + ] + ui_color = '#0273d4' + job_type = 'hadoop_job' + + def __init__( + self, + *, + main_jar: Optional[str] = None, + main_class: Optional[str] = None, + arguments: Optional[List] = None, + archives: Optional[List] = None, + files: Optional[List] = None, + **kwargs, + ) -> None: + # TODO: Remove one day + warnings.warn( + "The `{cls}` operator is deprecated, please use `DataprocSubmitJobOperator` instead. You can use" + " `generate_job` method of `{cls}` to generate dictionary representing your job" + " and use it with the new operator.".format(cls=type(self).__name__), + DeprecationWarning, + stacklevel=1, + ) + + super().__init__(**kwargs) + self.main_jar = main_jar + self.main_class = main_class + self.arguments = arguments + self.archives = archives + self.files = files + + def generate_job(self): + """ + Helper method for easier migration to `DataprocSubmitJobOperator`. + :return: Dict representing Dataproc job + """ + self.create_job_template() + self.job_template.set_main(self.main_jar, self.main_class) + self.job_template.add_args(self.arguments) + self.job_template.add_archive_uris(self.archives) + self.job_template.add_file_uris(self.files) + return self._generate_job_template() + + def execute(self, context): + self.create_job_template() + self.job_template.set_main(self.main_jar, self.main_class) + self.job_template.add_args(self.arguments) + self.job_template.add_archive_uris(self.archives) + self.job_template.add_file_uris(self.files) + + super().execute(context) + + +class DataprocSubmitPySparkJobOperator(DataprocJobBaseOperator): + """ + Start a PySpark Job on a Cloud DataProc cluster. + + :param main: [Required] The Hadoop Compatible Filesystem (HCFS) URI of the main + Python file to use as the driver. Must be a .py file. (templated) + :type main: str + :param arguments: Arguments for the job. (templated) + :type arguments: list + :param archives: List of archived files that will be unpacked in the work + directory. Should be stored in Cloud Storage. + :type archives: list + :param files: List of files to be copied to the working directory + :type files: list + :param pyfiles: List of Python files to pass to the PySpark framework. 
+ Supported file types: .py, .egg, and .zip + :type pyfiles: list + """ + + template_fields = [ + 'main', + 'arguments', + 'job_name', + 'cluster_name', + 'region', + 'dataproc_jars', + 'dataproc_properties', + 'impersonation_chain', + ] + ui_color = '#0273d4' + job_type = 'pyspark_job' + + @staticmethod + def _generate_temp_filename(filename): + date = time.strftime('%Y%m%d%H%M%S') + return f"{date}_{str(uuid.uuid4())[:8]}_{ntpath.basename(filename)}" + + def _upload_file_temp(self, bucket, local_file): + """Upload a local file to a Google Cloud Storage bucket.""" + temp_filename = self._generate_temp_filename(local_file) + if not bucket: + raise AirflowException( + "If you want Airflow to upload the local file to a temporary bucket, set " + "the 'temp_bucket' key in the connection string" + ) + + self.log.info("Uploading %s to %s", local_file, temp_filename) + + GCSHook(gcp_conn_id=self.gcp_conn_id, impersonation_chain=self.impersonation_chain).upload( + bucket_name=bucket, + object_name=temp_filename, + mime_type='application/x-python', + filename=local_file, + ) + return f"gs://{bucket}/{temp_filename}" + + def __init__( + self, + *, + main: str, + arguments: Optional[List] = None, + archives: Optional[List] = None, + pyfiles: Optional[List] = None, + files: Optional[List] = None, + **kwargs, + ) -> None: + # TODO: Remove one day + warnings.warn( + "The `{cls}` operator is deprecated, please use `DataprocSubmitJobOperator` instead. You can use" + " `generate_job` method of `{cls}` to generate dictionary representing your job" + " and use it with the new operator.".format(cls=type(self).__name__), + DeprecationWarning, + stacklevel=1, + ) + + super().__init__(**kwargs) + self.main = main + self.arguments = arguments + self.archives = archives + self.files = files + self.pyfiles = pyfiles + + def generate_job(self): + """ + Helper method for easier migration to `DataprocSubmitJobOperator`. + :return: Dict representing Dataproc job + """ + self.create_job_template() + # Check if the file is local, if that is the case, upload it to a bucket + if os.path.isfile(self.main): + cluster_info = self.hook.get_cluster( + project_id=self.project_id, region=self.region, cluster_name=self.cluster_name + ) + bucket = cluster_info['config']['config_bucket'] + self.main = f"gs://{bucket}/{self.main}" + self.job_template.set_python_main(self.main) + self.job_template.add_args(self.arguments) + self.job_template.add_archive_uris(self.archives) + self.job_template.add_file_uris(self.files) + self.job_template.add_python_file_uris(self.pyfiles) + + return self._generate_job_template() + + def execute(self, context): + self.create_job_template() + # Check if the file is local, if that is the case, upload it to a bucket + if os.path.isfile(self.main): + cluster_info = self.hook.get_cluster( + project_id=self.project_id, region=self.region, cluster_name=self.cluster_name + ) + bucket = cluster_info['config']['config_bucket'] + self.main = self._upload_file_temp(bucket, self.main) + + self.job_template.set_python_main(self.main) + self.job_template.add_args(self.arguments) + self.job_template.add_archive_uris(self.archives) + self.job_template.add_file_uris(self.files) + self.job_template.add_python_file_uris(self.pyfiles) + + super().execute(context) + + +class DataprocCreateWorkflowTemplateOperator(BaseOperator): + """ + Creates new workflow template. + + :param project_id: Required. The ID of the Google Cloud project the cluster belongs to. + :type project_id: str + :param region: Required. 
The Cloud Dataproc region in which to handle the request. + :type region: str + :param location: (To be deprecated). The Cloud Dataproc region in which to handle the request. + :type location: str + :param template: The Dataproc workflow template to create. If a dict is provided, + it must be of the same form as the protobuf message WorkflowTemplate. + :type template: Union[dict, WorkflowTemplate] + :param retry: A retry object used to retry requests. If ``None`` is specified, requests will not be + retried. + :type retry: google.api_core.retry.Retry + :param timeout: The amount of time, in seconds, to wait for the request to complete. Note that if + ``retry`` is specified, the timeout applies to each individual attempt. + :type timeout: float + :param metadata: Additional metadata that is provided to the method. + :type metadata: Sequence[Tuple[str, str]] + """ + + template_fields = ("region", "template") + template_fields_renderers = {"template": "json"} + + def __init__( + self, + *, + template: Dict, + project_id: str, + region: str = None, + location: Optional[str] = None, + retry: Optional[Retry] = None, + timeout: Optional[float] = None, + metadata: Optional[Sequence[Tuple[str, str]]] = None, + gcp_conn_id: str = "google_cloud_default", + impersonation_chain: Optional[Union[str, Sequence[str]]] = None, + **kwargs, + ): + if region is None: + if location is not None: + warnings.warn( + "Parameter `location` will be deprecated. " + "Please provide value through `region` parameter instead.", + DeprecationWarning, + stacklevel=1, + ) + region = location + else: + raise TypeError("missing 1 required keyword argument: 'region'") + super().__init__(**kwargs) + self.region = region + self.template = template + self.project_id = project_id + self.retry = retry + self.timeout = timeout + self.metadata = metadata + self.gcp_conn_id = gcp_conn_id + self.impersonation_chain = impersonation_chain + + def execute(self, context): + hook = DataprocHook(gcp_conn_id=self.gcp_conn_id, impersonation_chain=self.impersonation_chain) + self.log.info("Creating template") + try: + workflow = hook.create_workflow_template( + region=self.region, + template=self.template, + project_id=self.project_id, + retry=self.retry, + timeout=self.timeout, + metadata=self.metadata, + ) + self.log.info("Workflow %s created", workflow.name) + except AlreadyExists: + self.log.info("Workflow with given id already exists") + + +class DataprocInstantiateWorkflowTemplateOperator(BaseOperator): + """ + Instantiate a WorkflowTemplate on Google Cloud Dataproc. The operator will wait + until the WorkflowTemplate is finished executing. + + .. seealso:: + Please refer to: + https://cloud.google.com/dataproc/docs/reference/rest/v1beta2/projects.regions.workflowTemplates/instantiate + + :param template_id: The id of the template. (templated) + :type template_id: str + :param project_id: The ID of the google cloud project in which + the template runs + :type project_id: str + :param region: The specified region where the dataproc cluster is created. + :type region: str + :param parameters: a map of parameters for Dataproc Template in key-value format: + map (key: string, value: string) + Example: { "date_from": "2019-08-01", "date_to": "2019-08-02"}. + Values may not exceed 100 characters. Please refer to: + https://cloud.google.com/dataproc/docs/concepts/workflows/workflow-parameters + :type parameters: Dict[str, str] + :param request_id: Optional. A unique id used to identify the request. 
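To illustrate the two workflow-template operators in this hunk, here is a minimal sketch that creates a template and then instantiates it by id. The template body, ids, cluster config, and jar path are placeholders in the `WorkflowTemplate` dict shape the docstrings refer to, and the imports assume the upstream provider package.

```python
from datetime import datetime

from airflow import DAG
from airflow.providers.google.cloud.operators.dataproc import (
    DataprocCreateWorkflowTemplateOperator,
    DataprocInstantiateWorkflowTemplateOperator,
)

WORKFLOW_TEMPLATE = {
    "id": "sparkpi",  # placeholder
    # A real ClusterConfig belongs under "config"; left empty here for brevity.
    "placement": {"managed_cluster": {"cluster_name": "workflow-cluster", "config": {}}},
    "jobs": [
        {
            "step_id": "compute_pi",
            "spark_job": {
                "main_class": "org.apache.spark.examples.SparkPi",
                "jar_file_uris": ["file:///usr/lib/spark/examples/jars/spark-examples.jar"],
            },
        }
    ],
}

with DAG("dataproc_workflow_sketch", start_date=datetime(2021, 1, 1), schedule_interval=None) as dag:
    create_template = DataprocCreateWorkflowTemplateOperator(
        task_id="create_template",
        project_id="my-project",  # placeholder
        region="us-west1",        # placeholder
        template=WORKFLOW_TEMPLATE,
    )

    run_template = DataprocInstantiateWorkflowTemplateOperator(
        task_id="run_template",
        project_id="my-project",
        region="us-west1",
        template_id="sparkpi",
    )

    create_template >> run_template
```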
If the server receives two + ``SubmitJobRequest`` requests with the same id, then the second request will be ignored and the first + ``Job`` created and stored in the backend is returned. + It is recommended to always set this value to a UUID. + :type request_id: str + :param retry: A retry object used to retry requests. If ``None`` is specified, requests will not be + retried. + :type retry: google.api_core.retry.Retry + :param timeout: The amount of time, in seconds, to wait for the request to complete. Note that if + ``retry`` is specified, the timeout applies to each individual attempt. + :type timeout: float + :param metadata: Additional metadata that is provided to the method. + :type metadata: Sequence[Tuple[str, str]] + :param gcp_conn_id: The connection ID to use connecting to Google Cloud. + :type gcp_conn_id: str + :param impersonation_chain: Optional service account to impersonate using short-term + credentials, or chained list of accounts required to get the access_token + of the last account in the list, which will be impersonated in the request. + If set as a string, the account must grant the originating account + the Service Account Token Creator IAM role. + If set as a sequence, the identities from the list must grant + Service Account Token Creator IAM role to the directly preceding identity, with first + account from the list granting this role to the originating account (templated). + :type impersonation_chain: Union[str, Sequence[str]] + """ + + template_fields = ['template_id', 'impersonation_chain', 'request_id', 'parameters'] + template_fields_renderers = {"parameters": "json"} + + def __init__( + self, + *, + template_id: str, + region: str, + project_id: Optional[str] = None, + version: Optional[int] = None, + request_id: Optional[str] = None, + parameters: Optional[Dict[str, str]] = None, + retry: Optional[Retry] = None, + timeout: Optional[float] = None, + metadata: Optional[Sequence[Tuple[str, str]]] = None, + gcp_conn_id: str = "google_cloud_default", + impersonation_chain: Optional[Union[str, Sequence[str]]] = None, + **kwargs, + ) -> None: + super().__init__(**kwargs) + + self.template_id = template_id + self.parameters = parameters + self.version = version + self.project_id = project_id + self.region = region + self.retry = retry + self.timeout = timeout + self.metadata = metadata + self.request_id = request_id + self.gcp_conn_id = gcp_conn_id + self.impersonation_chain = impersonation_chain + + def execute(self, context): + hook = DataprocHook(gcp_conn_id=self.gcp_conn_id, impersonation_chain=self.impersonation_chain) + self.log.info('Instantiating template %s', self.template_id) + operation = hook.instantiate_workflow_template( + project_id=self.project_id, + region=self.region, + template_name=self.template_id, + version=self.version, + request_id=self.request_id, + parameters=self.parameters, + retry=self.retry, + timeout=self.timeout, + metadata=self.metadata, + ) + operation.result() + self.log.info('Template instantiated.') + + +class DataprocInstantiateInlineWorkflowTemplateOperator(BaseOperator): + """ + Instantiate a WorkflowTemplate Inline on Google Cloud Dataproc. The operator will + wait until the WorkflowTemplate is finished executing. + + .. seealso:: + Please refer to: + https://cloud.google.com/dataproc/docs/reference/rest/v1beta2/projects.regions.workflowTemplates/instantiateInline + + :param template: The template contents. 
(templated) + :type template: dict + :param project_id: The ID of the google cloud project in which + the template runs + :type project_id: str + :param region: The specified region where the dataproc cluster is created. + :type region: str + :param parameters: a map of parameters for Dataproc Template in key-value format: + map (key: string, value: string) + Example: { "date_from": "2019-08-01", "date_to": "2019-08-02"}. + Values may not exceed 100 characters. Please refer to: + https://cloud.google.com/dataproc/docs/concepts/workflows/workflow-parameters + :type parameters: Dict[str, str] + :param request_id: Optional. A unique id used to identify the request. If the server receives two + ``SubmitJobRequest`` requests with the same id, then the second request will be ignored and the first + ``Job`` created and stored in the backend is returned. + It is recommended to always set this value to a UUID. + :type request_id: str + :param retry: A retry object used to retry requests. If ``None`` is specified, requests will not be + retried. + :type retry: google.api_core.retry.Retry + :param timeout: The amount of time, in seconds, to wait for the request to complete. Note that if + ``retry`` is specified, the timeout applies to each individual attempt. + :type timeout: float + :param metadata: Additional metadata that is provided to the method. + :type metadata: Sequence[Tuple[str, str]] + :param gcp_conn_id: The connection ID to use connecting to Google Cloud. + :type gcp_conn_id: str + :param impersonation_chain: Optional service account to impersonate using short-term + credentials, or chained list of accounts required to get the access_token + of the last account in the list, which will be impersonated in the request. + If set as a string, the account must grant the originating account + the Service Account Token Creator IAM role. + If set as a sequence, the identities from the list must grant + Service Account Token Creator IAM role to the directly preceding identity, with first + account from the list granting this role to the originating account (templated). 
+ :type impersonation_chain: Union[str, Sequence[str]] + """ + + template_fields = ['template', 'impersonation_chain'] + template_fields_renderers = {"template": "json"} + + def __init__( + self, + *, + template: Dict, + region: str, + project_id: Optional[str] = None, + request_id: Optional[str] = None, + retry: Optional[Retry] = None, + timeout: Optional[float] = None, + metadata: Optional[Sequence[Tuple[str, str]]] = None, + gcp_conn_id: str = "google_cloud_default", + impersonation_chain: Optional[Union[str, Sequence[str]]] = None, + **kwargs, + ) -> None: + super().__init__(**kwargs) + self.template = template + self.project_id = project_id + self.region = region + self.template = template + self.request_id = request_id + self.retry = retry + self.timeout = timeout + self.metadata = metadata + self.gcp_conn_id = gcp_conn_id + self.impersonation_chain = impersonation_chain + + def execute(self, context): + self.log.info('Instantiating Inline Template') + hook = DataprocHook(gcp_conn_id=self.gcp_conn_id, impersonation_chain=self.impersonation_chain) + operation = hook.instantiate_inline_workflow_template( + template=self.template, + project_id=self.project_id, + region=self.region, + request_id=self.request_id, + retry=self.retry, + timeout=self.timeout, + metadata=self.metadata, + ) + operation.result() + self.log.info('Template instantiated.') + + +class DataprocSubmitJobOperator(BaseOperator): + """ + Submits a job to a cluster. + + :param project_id: Required. The ID of the Google Cloud project that the job belongs to. + :type project_id: str + :param region: Required. The Cloud Dataproc region in which to handle the request. + :type region: str + :param location: (To be deprecated). The Cloud Dataproc region in which to handle the request. + :type location: str + :param job: Required. The job resource. + If a dict is provided, it must be of the same form as the protobuf message + :class:`~google.cloud.dataproc_v1.types.Job` + :type job: Dict + :param request_id: Optional. A unique id used to identify the request. If the server receives two + ``SubmitJobRequest`` requests with the same id, then the second request will be ignored and the first + ``Job`` created and stored in the backend is returned. + It is recommended to always set this value to a UUID. + :type request_id: str + :param retry: A retry object used to retry requests. If ``None`` is specified, requests will not be + retried. + :type retry: google.api_core.retry.Retry + :param timeout: The amount of time, in seconds, to wait for the request to complete. Note that if + ``retry`` is specified, the timeout applies to each individual attempt. + :type timeout: float + :param metadata: Additional metadata that is provided to the method. + :type metadata: Sequence[Tuple[str, str]] + :param gcp_conn_id: + :type gcp_conn_id: str + :param impersonation_chain: Optional service account to impersonate using short-term + credentials, or chained list of accounts required to get the access_token + of the last account in the list, which will be impersonated in the request. + If set as a string, the account must grant the originating account + the Service Account Token Creator IAM role. + If set as a sequence, the identities from the list must grant + Service Account Token Creator IAM role to the directly preceding identity, with first + account from the list granting this role to the originating account (templated). 
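The inline variant above skips the create step and submits the template body directly. A correspondingly minimal sketch, with the same placeholder `WorkflowTemplate` dict shape as in the create-template example:

```python
from airflow.providers.google.cloud.operators.dataproc import (
    DataprocInstantiateInlineWorkflowTemplateOperator,
)

run_inline = DataprocInstantiateInlineWorkflowTemplateOperator(
    task_id="run_inline",
    project_id="my-project",  # placeholder
    region="us-west1",        # placeholder
    template={
        "id": "sparkpi-inline",  # placeholder
        "placement": {"managed_cluster": {"cluster_name": "workflow-cluster", "config": {}}},
        "jobs": [
            {
                "step_id": "compute_pi",
                "spark_job": {
                    "main_class": "org.apache.spark.examples.SparkPi",
                    "jar_file_uris": ["file:///usr/lib/spark/examples/jars/spark-examples.jar"],
                },
            }
        ],
    },
)
```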
+ :type impersonation_chain: Union[str, Sequence[str]] + :param asynchronous: Flag to return after submitting the job to the Dataproc API. + This is useful for submitting long running jobs and + waiting on them asynchronously using the DataprocJobSensor + :type asynchronous: bool + :param cancel_on_kill: Flag which indicates whether cancel the hook's job or not, when on_kill is called + :type cancel_on_kill: bool + :param wait_timeout: How many seconds wait for job to be ready. Used only if ``asynchronous`` is False + :type wait_timeout: int + """ + + template_fields = ('project_id', 'region', 'job', 'impersonation_chain', 'request_id') + template_fields_renderers = {"job": "json"} + + operator_extra_links = (DataprocJobLink(),) + + def __init__( + self, + *, + project_id: str, + job: Dict, + region: str = None, + location: Optional[str] = None, + request_id: Optional[str] = None, + retry: Optional[Retry] = None, + timeout: Optional[float] = None, + metadata: Optional[Sequence[Tuple[str, str]]] = None, + gcp_conn_id: str = "google_cloud_default", + impersonation_chain: Optional[Union[str, Sequence[str]]] = None, + asynchronous: bool = False, + cancel_on_kill: bool = True, + wait_timeout: Optional[int] = None, + **kwargs, + ) -> None: + if region is None: + if location is not None: + warnings.warn( + "Parameter `location` will be deprecated. " + "Please provide value through `region` parameter instead.", + DeprecationWarning, + stacklevel=1, + ) + region = location + else: + raise TypeError("missing 1 required keyword argument: 'region'") + super().__init__(**kwargs) + self.project_id = project_id + self.region = region + self.job = job + self.request_id = request_id + self.retry = retry + self.timeout = timeout + self.metadata = metadata + self.gcp_conn_id = gcp_conn_id + self.impersonation_chain = impersonation_chain + self.asynchronous = asynchronous + self.cancel_on_kill = cancel_on_kill + self.hook: Optional[DataprocHook] = None + self.job_id: Optional[str] = None + self.wait_timeout = wait_timeout + + def execute(self, context: Dict): + self.log.info("Submitting job") + self.hook = DataprocHook(gcp_conn_id=self.gcp_conn_id, impersonation_chain=self.impersonation_chain) + job_object = self.hook.submit_job( + project_id=self.project_id, + region=self.region, + job=self.job, + request_id=self.request_id, + retry=self.retry, + timeout=self.timeout, + metadata=self.metadata, + ) + job_id = job_object.reference.job_id + self.log.info('Job %s submitted successfully.', job_id) + # Save data required by extra links no matter what the job status will be + self.xcom_push( + context, + key="job_conf", + value={ + "job_id": job_id, + "region": self.region, + "project_id": self.project_id, + }, + ) + + if not self.asynchronous: + self.log.info('Waiting for job %s to complete', job_id) + self.hook.wait_for_job( + job_id=job_id, region=self.region, project_id=self.project_id, timeout=self.wait_timeout + ) + self.log.info('Job %s completed successfully.', job_id) + + self.job_id = job_id + return self.job_id + + def on_kill(self): + if self.job_id and self.cancel_on_kill: + self.hook.cancel_job(job_id=self.job_id, project_id=self.project_id, region=self.region) + + +class DataprocUpdateClusterOperator(BaseOperator): + """ + Updates a cluster in a project. + + :param project_id: Required. The ID of the Google Cloud project the cluster belongs to. + :type project_id: str + :param region: Required. The Cloud Dataproc region in which to handle the request. 
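A minimal sketch of the replacement `DataprocSubmitJobOperator` pattern, submitting a PySpark job as a plain dict in the `google.cloud.dataproc_v1` Job shape; bucket, cluster, project, and region values are placeholders, and the import assumes the upstream provider package.

```python
from airflow.providers.google.cloud.operators.dataproc import DataprocSubmitJobOperator

PYSPARK_JOB = {
    "reference": {"project_id": "my-project"},                              # placeholder
    "placement": {"cluster_name": "cluster-1"},                             # placeholder
    "pyspark_job": {"main_python_file_uri": "gs://my-bucket/jobs/job.py"},  # placeholder
}

submit_pyspark = DataprocSubmitJobOperator(
    task_id="submit_pyspark",
    project_id="my-project",  # placeholder
    region="us-west1",        # placeholder
    job=PYSPARK_JOB,
    # asynchronous=True returns right after submission (pair it with a job sensor);
    # the default False blocks until the job finishes, bounded by wait_timeout if set.
    asynchronous=False,
)
```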
+ :type region: str + :param location: (To be deprecated). The Cloud Dataproc region in which to handle the request. + :type location: str + :param cluster_name: Required. The cluster name. + :type cluster_name: str + :param cluster: Required. The changes to the cluster. + + If a dict is provided, it must be of the same form as the protobuf message + :class:`~google.cloud.dataproc_v1.types.Cluster` + :type cluster: Union[Dict, google.cloud.dataproc_v1.types.Cluster] + :param update_mask: Required. Specifies the path, relative to ``Cluster``, of the field to update. For + example, to change the number of workers in a cluster to 5, the ``update_mask`` parameter would be + specified as ``config.worker_config.num_instances``, and the ``PATCH`` request body would specify the + new value. If a dict is provided, it must be of the same form as the protobuf message + :class:`~google.protobuf.field_mask_pb2.FieldMask` + :type update_mask: Union[Dict, google.protobuf.field_mask_pb2.FieldMask] + :param graceful_decommission_timeout: Optional. Timeout for graceful YARN decommissioning. Graceful + decommissioning allows removing nodes from the cluster without interrupting jobs in progress. Timeout + specifies how long to wait for jobs in progress to finish before forcefully removing nodes (and + potentially interrupting jobs). Default timeout is 0 (for forceful decommission), and the maximum + allowed timeout is 1 day. + :type graceful_decommission_timeout: Union[Dict, google.protobuf.duration_pb2.Duration] + :param request_id: Optional. A unique id used to identify the request. If the server receives two + ``UpdateClusterRequest`` requests with the same id, then the second request will be ignored and the + first ``google.longrunning.Operation`` created and stored in the backend is returned. + :type request_id: str + :param retry: A retry object used to retry requests. If ``None`` is specified, requests will not be + retried. + :type retry: google.api_core.retry.Retry + :param timeout: The amount of time, in seconds, to wait for the request to complete. Note that if + ``retry`` is specified, the timeout applies to each individual attempt. + :type timeout: float + :param metadata: Additional metadata that is provided to the method. + :type metadata: Sequence[Tuple[str, str]] + :param gcp_conn_id: The connection ID to use connecting to Google Cloud. + :type gcp_conn_id: str + :param impersonation_chain: Optional service account to impersonate using short-term + credentials, or chained list of accounts required to get the access_token + of the last account in the list, which will be impersonated in the request. + If set as a string, the account must grant the originating account + the Service Account Token Creator IAM role. + If set as a sequence, the identities from the list must grant + Service Account Token Creator IAM role to the directly preceding identity, with first + account from the list granting this role to the originating account (templated). 
+ :type impersonation_chain: Union[str, Sequence[str]] + """ + + template_fields = ('impersonation_chain', 'cluster_name') + operator_extra_links = (DataprocClusterLink(),) + + def __init__( + self, + *, + cluster_name: str, + cluster: Union[Dict, Cluster], + update_mask: Union[Dict, FieldMask], + graceful_decommission_timeout: Union[Dict, Duration], + region: str = None, + location: Optional[str] = None, + request_id: Optional[str] = None, + project_id: Optional[str] = None, + retry: Retry = None, + timeout: Optional[float] = None, + metadata: Optional[Sequence[Tuple[str, str]]] = None, + gcp_conn_id: str = "google_cloud_default", + impersonation_chain: Optional[Union[str, Sequence[str]]] = None, + **kwargs, + ): + if region is None: + if location is not None: + warnings.warn( + "Parameter `location` will be deprecated. " + "Please provide value through `region` parameter instead.", + DeprecationWarning, + stacklevel=1, + ) + region = location + else: + raise TypeError("missing 1 required keyword argument: 'region'") + super().__init__(**kwargs) + self.project_id = project_id + self.region = region + self.cluster_name = cluster_name + self.cluster = cluster + self.update_mask = update_mask + self.graceful_decommission_timeout = graceful_decommission_timeout + self.request_id = request_id + self.retry = retry + self.timeout = timeout + self.metadata = metadata + self.gcp_conn_id = gcp_conn_id + self.impersonation_chain = impersonation_chain + + def execute(self, context: Dict): + hook = DataprocHook(gcp_conn_id=self.gcp_conn_id, impersonation_chain=self.impersonation_chain) + # Save data required by extra links no matter what the cluster status will be + self.xcom_push( + context, + key="cluster_conf", + value={ + "cluster_name": self.cluster_name, + "region": self.region, + "project_id": self.project_id, + }, + ) + self.log.info("Updating %s cluster.", self.cluster_name) + operation = hook.update_cluster( + project_id=self.project_id, + region=self.region, + cluster_name=self.cluster_name, + cluster=self.cluster, + update_mask=self.update_mask, + graceful_decommission_timeout=self.graceful_decommission_timeout, + request_id=self.request_id, + retry=self.retry, + timeout=self.timeout, + metadata=self.metadata, + ) + operation.result() + self.log.info("Updated %s cluster.", self.cluster_name) + diff --git a/dev_webserver_config.py b/dev_webserver_config.py index f17626c45..015ce9af2 100644 --- a/dev_webserver_config.py +++ b/dev_webserver_config.py @@ -126,5 +126,3 @@ # APP_THEME = "superhero.css" # APP_THEME = "united.css" # APP_THEME = "yeti.css" - - diff --git a/docker-compose.yml b/docker-compose.yml index 2b43cd6de..58047c43f 100644 --- a/docker-compose.yml +++ b/docker-compose.yml @@ -5,6 +5,7 @@ services: image: mysql:5.7 ports: - '3306:3306' + # command: ['--explicit_defaults_for_timestamp=1', '--character-set-server=utf8mb4'] command: ['--explicit_defaults_for_timestamp=1'] environment: MYSQL_ROOT_PASSWORD: secret @@ -51,10 +52,11 @@ services: - AIRFLOW_EMAIL_BACKEND=airflow.macros.log_email_backend.log_email_backend - AIRFLOW__KUBERNETES__IN_CLUSTER=False - URL=http://localhost:8000 - - WEBSERVER_USE_RBAC=False # URL-encoded dummy connections; note that we define some other connections # in the bin/run script - AIRFLOW_CONN_ADM_SFTP=ftp://myname:mypassword@myhost.com:8000?known_hosts=myhost.com+AAAABBBBB + # TODO(hwoo) - improve developer workflow by not loading all dags + # - AIRFLOW__CORE__DAGS_FOLDER=$AIRFLOW_HOME/devdags web: extends: diff --git a/plugins/__init__.py 
b/plugins/__init__.py index dfa614b12..61a2d790f 100644 --- a/plugins/__init__.py +++ b/plugins/__init__.py @@ -8,14 +8,8 @@ # Backfill Plugin Imports from backfill.main import Backfill -# Get RBAC config. -rbac_authentication_enabled = configuration.getboolean("webserver", "RBAC") - # Init the plugin in Webserver's "Admin" Menu with Menu Item as "Backfill" -if rbac_authentication_enabled == True: - backfill_admin_view = {"category" : "Admin", "name" : "Backfill (Alpha)", "view": Backfill()} -else: - backfill_admin_view = Backfill(category="Admin", name="Backfill (Alpha)") +backfill_admin_view = {"category" : "Admin", "name" : "Backfill (Alpha)", "view": Backfill()} # Creating a flask blueprint to integrate the templates folder backfill_blueprint = Blueprint( diff --git a/plugins/backfill/main.py b/plugins/backfill/main.py index 034ab6a2b..184ff5413 100644 --- a/plugins/backfill/main.py +++ b/plugins/backfill/main.py @@ -33,8 +33,6 @@ # Local file where history will be stored FILE = airflow_home_path + '/logs/backfill_history.txt' -rbac_authentication_enabled = configuration.getboolean("webserver", "RBAC") - # RE for remove ansi escape characters ansi_escape = re.compile(r'\x1B[@-_][0-?]*[ -/]*[@-~]') @@ -58,26 +56,15 @@ def file_ops(mode, data=None): return 1 def get_baseview(): - if rbac_authentication_enabled == True: - return AppBuilderBaseView - else: - return BaseView + return AppBuilderBaseView class Backfill(get_baseview()): route_base = "/admin/backfill/" - if rbac_authentication_enabled == True: - @app_builder_expose('/') - def list(self): - """ Render the backfill page to client with RBAC""" - return self.render_template("backfill_page.html", - rbac_authentication_enabled=rbac_authentication_enabled) - else: - @expose('/') - def base(self): - """ Render the backfill page to client """ - return self.render("backfill_page.html") + @app_builder_expose('/') + def list(self): + return self.render_template("backfill_page.html") @expose('/stream') @app_builder_expose('/stream') @@ -106,9 +93,10 @@ def stream(self): if use_task_regex == 'true': cmd.extend(['-t', str(task_regex)]) elif clear == 'false': + cmd.append('dags') cmd.append('backfill') if dry_run == 'true': - cmd.append('--dry_run') + cmd.append('--dry-run') if use_task_regex == 'true': cmd.extend(['-t', str(task_regex)]) diff --git a/plugins/mozmenu.py b/plugins/mozmenu.py index 9f5f6a0cf..a16648ea1 100644 --- a/plugins/mozmenu.py +++ b/plugins/mozmenu.py @@ -4,21 +4,20 @@ Based on an example at https://github.com/airflow-plugins/Getting-Started/blob/master/Tutorial/creating-ui-modification.md """ - - from airflow.plugins_manager import AirflowPlugin -from flask_admin.base import MenuLink -telemetry_airflow = MenuLink( - category="Mozilla", - name="telemetry-airflow on GitHub", - url="https://github.com/mozilla/telemetry-airflow") +telemetry_airflow = { + "name": "telemetry-airflow on GitHub", + "category": "Mozilla", + "href": "https://github.com/mozilla/telemetry-airflow" +} -wtmo_dev = MenuLink( - category="Mozilla", - name="WTMO Developer Guide", - url="https://mana.mozilla.org/wiki/display/DOPS/WTMO+Developer+Guide") +wtmo_dev = { + "name": "WTMO Developer Guide", + "category": "Mozilla", + "href": "https://mana.mozilla.org/wiki/display/DOPS/WTMO+Developer+Guide" +} class MozMenuPlugin(AirflowPlugin): name = "Mozilla" @@ -26,5 +25,5 @@ class MozMenuPlugin(AirflowPlugin): flask_blueprints = [] hooks = [] executors = [] - admin_views = [] - menu_links = [telemetry_airflow, wtmo_dev] + appbuilder_views = [] + 
appbuilder_menu_items = [telemetry_airflow, wtmo_dev] diff --git a/plugins/templates/backfill_page.html b/plugins/templates/backfill_page.html index 870447186..cfaf96109 100644 --- a/plugins/templates/backfill_page.html +++ b/plugins/templates/backfill_page.html @@ -1,4 +1,4 @@ -{% extends 'airflow/master.html' %} +{% extends base_template %} {% block title %}Airflow - Backfill Plugin{% endblock %} @@ -320,13 +320,12 @@ {% endblock %} {% block body %} -{% if rbac_authentication_enabled %} + {% block navbar %}
{% include 'appbuilder/navbar.html' %}
{% endblock %}
-{%endif%}

Backfill (Alpha)

diff --git a/requirements.in b/requirements.in index 527dde22b..822ab3bec 100644 --- a/requirements.in +++ b/requirements.in @@ -1,16 +1,11 @@ boto3==1.15.18 botocore<1.19.0,>=1.18.0 kombu==4.6.10 # CeleryExecutor issues with 1.10.2 supposedly fixed in 1.10.5 airflow, but still observed issues on 1.10.7 -importlib-metadata==2.1.0 +importlib-metadata>=1.7 argcomplete==1.12.2 pandas-gbq==0.14.1 # removed hdfs -apache-airflow[celery,postgres,hive,jdbc,async,password,crypto,github_enterprise,datadog,statsd,s3,mysql,google_auth,gcp_api,kubernetes]==1.10.15 -apache-airflow-upgrade-check -# Airflow 2.0 backported providers -apache-airflow-backport-providers-google -apache-airflow-backport-providers-amazon -apache-airflow-backport-providers-http +apache-airflow[amazon,celery,postgres,apache.hive,jdbc,async,password,crypto,github_enterprise,datadog,statsd,mysql,google_auth,cncf.kubernetes]==2.1.1 cryptography>=3.2 mozlogging retrying @@ -19,11 +14,14 @@ redis hiredis requests jsonschema +flask-admin Flask-OAuthlib +Authlib~=0.15.3 +Flask-AppBuilder>=3.3.0 pytz -werkzeug==0.16.0 +werkzeug>=1.0.1,~=1.0 # The next requirements are for kubernetes-client/python -urllib3>=1.24.2 # MIT +urllib3>=1.24.2 # MIT ipaddress>=1.0.17;python_version=="2.7" # PSF websocket-client>=0.32.0,!=0.40.0,!=0.41.*,!=0.42.* # LGPLv2+ # Pin to older version, newer version has issues @@ -31,4 +29,13 @@ JPype1==0.7.1 shelljob==0.5.6 # Fix no inspection available issue # https://github.com/apache/airflow/issues/8211 -SQLAlchemy==1.3.15 +SQLAlchemy>=1.3.18 +# Airflow 2 no longer installs http provider by default, until chardet becomes an optional dependency of requests +apache-airflow-providers-http +airflow-provider-fivetran +# Upgrade google dataproc provider to fix beta client clusterConfig and mismatch issues +apache-airflow-providers-google==5.0.0 +# 2.4.0 is broken for dataproc cluster create/delete +# 2.6.0 and 3.0.0 are newer but not compatible with apache-airflow-providers-google +# yet until maybe v7.0.0 bc 'google.cloud.dataproc_v1beta2' is deprecated +google-cloud-dataproc==2.5.0 diff --git a/requirements.txt b/requirements.txt index f9d4f420c..53eb2b249 100644 --- a/requirements.txt +++ b/requirements.txt @@ -4,156 +4,184 @@ # # pip-compile # +airflow-provider-fivetran==1.0.1 # via -r requirements.in alembic==1.6.5 # via apache-airflow amqp==2.6.1 # via kombu -apache-airflow-backport-providers-amazon==2021.3.3 # via -r requirements.in -apache-airflow-backport-providers-google==2021.3.3 # via -r requirements.in -apache-airflow-backport-providers-http==2021.4.10 # via -r requirements.in -apache-airflow-upgrade-check==1.4.0 # via -r requirements.in -apache-airflow[async,celery,crypto,datadog,gcp_api,github_enterprise,google_auth,hive,jdbc,kubernetes,mysql,password,postgres,s3,statsd]==1.10.15 # via -r requirements.in, apache-airflow-backport-providers-amazon, apache-airflow-backport-providers-google, apache-airflow-upgrade-check -apispec[yaml]==1.3.3 # via flask-appbuilder -argcomplete==1.12.2 # via -r requirements.in, apache-airflow +anyio==3.3.0 # via httpcore +apache-airflow-providers-amazon==2.1.0 # via apache-airflow +apache-airflow-providers-apache-hive==2.0.1 # via apache-airflow +apache-airflow-providers-celery==2.0.0 # via apache-airflow +apache-airflow-providers-cncf-kubernetes==2.0.2 # via apache-airflow +apache-airflow-providers-datadog==2.0.0 # via apache-airflow +apache-airflow-providers-ftp==2.0.0 # via apache-airflow +apache-airflow-providers-google==5.0.0 # via -r requirements.in 
+apache-airflow-providers-http==2.0.0 # via -r requirements.in
+apache-airflow-providers-imap==2.0.0 # via apache-airflow
+apache-airflow-providers-jdbc==2.0.0 # via apache-airflow
+apache-airflow-providers-mysql==2.1.0 # via apache-airflow
+apache-airflow-providers-postgres==2.0.0 # via apache-airflow
+apache-airflow-providers-sqlite==2.0.0 # via apache-airflow
+apache-airflow[amazon,apache.hive,async,celery,cncf.kubernetes,crypto,datadog,github_enterprise,google_auth,jdbc,mysql,password,postgres,statsd]==2.1.1 # via -r requirements.in, airflow-provider-fivetran, apache-airflow-providers-amazon, apache-airflow-providers-apache-hive, apache-airflow-providers-celery, apache-airflow-providers-cncf-kubernetes, apache-airflow-providers-datadog, apache-airflow-providers-google, apache-airflow-providers-http, apache-airflow-providers-jdbc, apache-airflow-providers-mysql, apache-airflow-providers-postgres
+apispec[yaml]==3.3.2 # via flask-appbuilder
+argcomplete==1.12.2 # via -r requirements.in, apache-airflow, nox
 attrs==20.3.0 # via apache-airflow, cattrs, jsonschema
+authlib==0.15.4 # via -r requirements.in
 babel==2.9.1 # via flask-babel
+backports.entry-points-selectable==1.1.0 # via virtualenv
 bcrypt==3.2.0 # via apache-airflow, flask-bcrypt
 billiard==3.6.4.0 # via celery
-boto3==1.15.18 # via -r requirements.in, apache-airflow, apache-airflow-backport-providers-amazon, watchtower
-botocore==1.18.18 # via -r requirements.in, apache-airflow-backport-providers-amazon, boto3, s3transfer
+blinker==1.4 # via apache-airflow
+boto3==1.15.18 # via -r requirements.in, apache-airflow-providers-amazon, watchtower
+botocore==1.18.18 # via -r requirements.in, boto3, s3transfer
 cached-property==1.5.2 # via apache-airflow
 cachetools==4.2.2 # via google-auth
-cattrs==1.7.1 # via apache-airflow
-celery==4.4.7 # via apache-airflow, flower
-certifi==2021.5.30 # via kubernetes, requests
+cattrs==1.5.0 # via apache-airflow
+celery==4.4.7 # via apache-airflow-providers-celery, flower
+certifi==2021.5.30 # via httpx, kubernetes, requests
 cffi==1.14.6 # via bcrypt, cryptography, google-crc32c
 chardet==3.0.4 # via requests
-click==7.1.2 # via flask, flask-appbuilder, hmsclient
-colorama==0.4.4 # via flask-appbuilder
-colorlog==4.0.2 # via apache-airflow
-configparser==3.5.3 # via apache-airflow
+charset-normalizer==2.0.4 # via httpx
+click==7.1.2 # via clickclick, flask, flask-appbuilder, hmsclient
+clickclick==20.10.2 # via apache-airflow
+colorama==0.4.4 # via flask-appbuilder, rich
+colorlog==4.0.2 # via apache-airflow, nox
+commonmark==0.9.1 # via rich
 croniter==0.3.37 # via apache-airflow
-cryptography==3.4.7 # via -r requirements.in, apache-airflow, pyopenssl
-datadog==0.42.0 # via apache-airflow
+cryptography==3.4.7 # via -r requirements.in, apache-airflow, apache-airflow-providers-cncf-kubernetes, authlib, pyopenssl
+datadog==0.42.0 # via apache-airflow-providers-datadog
 defusedxml==0.7.1 # via python3-openid
 dill==0.3.4 # via apache-airflow
+distlib==0.3.2 # via virtualenv
 dnspython==1.16.0 # via email-validator, eventlet
-docutils==0.17.1 # via python-daemon
-email-validator==1.1.3 # via apache-airflow, flask-appbuilder
+docutils==0.16 # via apache-airflow, python-daemon
+email-validator==1.1.3 # via flask-appbuilder
 eventlet==0.31.1 # via apache-airflow
-flask-admin==1.5.4 # via apache-airflow
-flask-appbuilder==2.3.4 # via apache-airflow
+filelock==3.0.12 # via virtualenv
+flask-admin==1.5.8 # via -r requirements.in
+flask-appbuilder==3.3.2 # via -r requirements.in, apache-airflow
 flask-babel==1.0.0 # via flask-appbuilder
 flask-bcrypt==0.7.1 # via apache-airflow
-flask-caching==1.3.3 # via apache-airflow
+flask-caching==1.10.1 # via apache-airflow
 flask-jwt-extended==3.25.1 # via flask-appbuilder
 flask-login==0.4.1 # via apache-airflow, flask-appbuilder
 flask-oauthlib==0.9.5 # via -r requirements.in, apache-airflow
 flask-openid==1.2.5 # via flask-appbuilder
 flask-sqlalchemy==2.5.1 # via flask-appbuilder
-flask-swagger==0.2.14 # via apache-airflow
 flask-wtf==0.14.3 # via apache-airflow, flask-appbuilder
-flask==1.1.4 # via apache-airflow, flask-admin, flask-appbuilder, flask-babel, flask-bcrypt, flask-caching, flask-jwt-extended, flask-login, flask-oauthlib, flask-openid, flask-sqlalchemy, flask-swagger, flask-wtf
-flower==0.9.7 # via apache-airflow
-funcsigs==1.0.2 # via apache-airflow
-future==0.18.2 # via apache-airflow, pyhive
+flask==1.1.4 # via apache-airflow, flask-admin, flask-appbuilder, flask-babel, flask-bcrypt, flask-caching, flask-jwt-extended, flask-login, flask-oauthlib, flask-openid, flask-sqlalchemy, flask-wtf
+flower==0.9.7 # via apache-airflow-providers-celery
+future==0.18.2 # via pyhive
 gevent==21.1.2 # via apache-airflow
-google-ads==7.0.0 # via apache-airflow-backport-providers-google
-google-api-core[grpc,grpcgcp]==1.31.0 # via apache-airflow-backport-providers-google, google-ads, google-api-python-client, google-cloud-appengine-logging, google-cloud-automl, google-cloud-bigquery, google-cloud-bigquery-datatransfer, google-cloud-bigquery-storage, google-cloud-bigtable, google-cloud-container, google-cloud-core, google-cloud-datacatalog, google-cloud-dataproc, google-cloud-dlp, google-cloud-kms, google-cloud-language, google-cloud-logging, google-cloud-memcache, google-cloud-monitoring, google-cloud-os-login, google-cloud-pubsub, google-cloud-redis, google-cloud-secret-manager, google-cloud-spanner, google-cloud-speech, google-cloud-tasks, google-cloud-texttospeech, google-cloud-translate, google-cloud-videointelligence, google-cloud-vision, google-cloud-workflows
-google-api-python-client==1.12.8 # via apache-airflow, apache-airflow-backport-providers-google
-google-auth-httplib2==0.1.0 # via apache-airflow, apache-airflow-backport-providers-google, google-api-python-client
+google-ads==13.0.0 # via apache-airflow-providers-google
+google-api-core[grpc,grpcgcp]==1.31.0 # via apache-airflow-providers-google, google-ads, google-api-python-client, google-cloud-appengine-logging, google-cloud-automl, google-cloud-bigquery, google-cloud-bigquery-datatransfer, google-cloud-bigquery-storage, google-cloud-bigtable, google-cloud-container, google-cloud-core, google-cloud-datacatalog, google-cloud-dataproc, google-cloud-dlp, google-cloud-kms, google-cloud-language, google-cloud-logging, google-cloud-memcache, google-cloud-monitoring, google-cloud-os-login, google-cloud-pubsub, google-cloud-redis, google-cloud-secret-manager, google-cloud-spanner, google-cloud-speech, google-cloud-tasks, google-cloud-texttospeech, google-cloud-translate, google-cloud-videointelligence, google-cloud-vision, google-cloud-workflows
+google-api-python-client==1.12.8 # via apache-airflow-providers-google
+google-auth-httplib2==0.1.0 # via apache-airflow-providers-google, google-api-python-client
 google-auth-oauthlib==0.4.4 # via google-ads, pandas-gbq, pydata-google-auth
-google-auth==1.32.1 # via apache-airflow, apache-airflow-backport-providers-google, google-api-core, google-api-python-client, google-auth-httplib2, google-auth-oauthlib, google-cloud-core, google-cloud-storage, kubernetes, pandas-gbq, pydata-google-auth
+google-auth==1.32.1 # via apache-airflow-providers-google, google-api-core, google-api-python-client, google-auth-httplib2, google-auth-oauthlib, google-cloud-core, google-cloud-storage, kubernetes, pandas-gbq, pydata-google-auth
 google-cloud-appengine-logging==0.1.1 # via google-cloud-logging
 google-cloud-audit-log==0.1.0 # via google-cloud-logging
-google-cloud-automl==2.4.0 # via apache-airflow-backport-providers-google
-google-cloud-bigquery-datatransfer==3.3.0 # via apache-airflow-backport-providers-google
+google-cloud-automl==2.4.0 # via apache-airflow-providers-google
+google-cloud-bigquery-datatransfer==3.3.0 # via apache-airflow-providers-google
 google-cloud-bigquery-storage==2.6.0 # via google-cloud-bigquery
 google-cloud-bigquery[bqstorage,pandas]==2.20.0 # via pandas-gbq
-google-cloud-bigtable==1.7.0 # via apache-airflow, apache-airflow-backport-providers-google
-google-cloud-container==1.0.1 # via apache-airflow, apache-airflow-backport-providers-google
+google-cloud-bigtable==1.7.0 # via apache-airflow-providers-google
+google-cloud-container==1.0.1 # via apache-airflow-providers-google
 google-cloud-core==1.7.1 # via google-cloud-bigquery, google-cloud-bigtable, google-cloud-logging, google-cloud-spanner, google-cloud-storage, google-cloud-translate
-google-cloud-datacatalog==3.3.0 # via apache-airflow-backport-providers-google
-google-cloud-dataproc==2.4.0 # via apache-airflow-backport-providers-google
-google-cloud-dlp==1.0.0 # via apache-airflow, apache-airflow-backport-providers-google
-google-cloud-kms==2.4.0 # via apache-airflow-backport-providers-google
-google-cloud-language==1.3.0 # via apache-airflow, apache-airflow-backport-providers-google
-google-cloud-logging==2.5.0 # via apache-airflow-backport-providers-google
-google-cloud-memcache==1.1.0 # via apache-airflow-backport-providers-google
-google-cloud-monitoring==2.4.0 # via apache-airflow-backport-providers-google
-google-cloud-os-login==2.2.1 # via apache-airflow-backport-providers-google
-google-cloud-pubsub==2.6.1 # via apache-airflow-backport-providers-google
-google-cloud-redis==2.2.0 # via apache-airflow-backport-providers-google
-google-cloud-secret-manager==1.0.0 # via apache-airflow, apache-airflow-backport-providers-google
-google-cloud-spanner==1.19.1 # via apache-airflow, apache-airflow-backport-providers-google
-google-cloud-speech==1.3.2 # via apache-airflow, apache-airflow-backport-providers-google
-google-cloud-storage==1.40.0 # via apache-airflow, apache-airflow-backport-providers-google
-google-cloud-tasks==2.4.0 # via apache-airflow-backport-providers-google
-google-cloud-texttospeech==1.0.1 # via apache-airflow, apache-airflow-backport-providers-google
-google-cloud-translate==1.7.0 # via apache-airflow, apache-airflow-backport-providers-google
-google-cloud-videointelligence==1.16.1 # via apache-airflow, apache-airflow-backport-providers-google
-google-cloud-vision==1.0.0 # via apache-airflow, apache-airflow-backport-providers-google
-google-cloud-workflows==1.1.0 # via apache-airflow-backport-providers-google
+google-cloud-datacatalog==3.3.0 # via apache-airflow-providers-google
+google-cloud-dataproc==2.5.0 # via -r requirements.in, apache-airflow-providers-google
+google-cloud-dlp==1.0.0 # via apache-airflow-providers-google
+google-cloud-kms==2.4.0 # via apache-airflow-providers-google
+google-cloud-language==1.3.0 # via apache-airflow-providers-google
+google-cloud-logging==2.5.0 # via apache-airflow-providers-google
+google-cloud-memcache==1.0.0 # via apache-airflow-providers-google
+google-cloud-monitoring==2.4.0 # via apache-airflow-providers-google
+google-cloud-os-login==2.2.1 # via apache-airflow-providers-google
+google-cloud-pubsub==2.6.1 # via apache-airflow-providers-google
+google-cloud-redis==2.2.0 # via apache-airflow-providers-google
+google-cloud-secret-manager==1.0.0 # via apache-airflow-providers-google
+google-cloud-spanner==1.19.1 # via apache-airflow-providers-google
+google-cloud-speech==1.3.2 # via apache-airflow-providers-google
+google-cloud-storage==1.40.0 # via apache-airflow-providers-google
+google-cloud-tasks==2.4.0 # via apache-airflow-providers-google
+google-cloud-texttospeech==1.0.1 # via apache-airflow-providers-google
+google-cloud-translate==1.7.0 # via apache-airflow-providers-google
+google-cloud-videointelligence==1.16.1 # via apache-airflow-providers-google
+google-cloud-vision==1.0.0 # via apache-airflow-providers-google
+google-cloud-workflows==1.1.0 # via apache-airflow-providers-google
 google-crc32c==1.1.2 # via google-resumable-media
 google-resumable-media==1.3.1 # via google-cloud-bigquery, google-cloud-storage
 googleapis-common-protos[grpc]==1.53.0 # via google-ads, google-api-core, google-cloud-audit-log, grpc-google-iam-v1
 graphviz==0.16 # via apache-airflow
 greenlet==1.1.0 # via apache-airflow, eventlet, gevent
 grpc-google-iam-v1==0.12.3 # via google-cloud-bigtable, google-cloud-container, google-cloud-datacatalog, google-cloud-kms, google-cloud-pubsub, google-cloud-secret-manager, google-cloud-spanner, google-cloud-tasks
-grpcio-gcp==0.2.2 # via apache-airflow, apache-airflow-backport-providers-google, google-api-core
+grpcio-gcp==0.2.2 # via apache-airflow-providers-google, google-api-core
 grpcio==1.38.1 # via google-ads, google-api-core, google-cloud-bigquery, google-cloud-pubsub, googleapis-common-protos, grpc-google-iam-v1, grpcio-gcp
 gunicorn==20.1.0 # via apache-airflow
+h11==0.12.0 # via httpcore
 hiredis==2.0.0 # via -r requirements.in
-hmsclient==0.1.1 # via apache-airflow
+hmsclient==0.1.1 # via apache-airflow-providers-apache-hive
+httpcore==0.13.6 # via httpx
 httplib2==0.19.1 # via google-api-python-client, google-auth-httplib2
+httpx==0.19.0 # via apache-airflow, apache-airflow-providers-google
 humanize==3.10.0 # via flower
-idna==2.10 # via email-validator, requests
-importlib-metadata==2.1.0 # via -r requirements.in, apache-airflow, apache-airflow-upgrade-check, argcomplete, importlib-resources, jsonschema, kombu
+idna==2.10 # via anyio, email-validator, requests, rfc3986
+importlib-metadata==1.7.0 # via -r requirements.in, apache-airflow, argcomplete, importlib-resources, jsonschema, kombu, nox, virtualenv
 importlib-resources==1.5.0 # via apache-airflow
+inflection==0.5.1 # via apache-airflow
 iso8601==0.1.14 # via apache-airflow
-itsdangerous==1.1.0 # via flask, flask-wtf
-jaydebeapi==1.2.3 # via apache-airflow
-jinja2==2.11.3 # via apache-airflow, flask, flask-babel, python-nvd3
+isodate==0.6.0 # via openapi-schema-validator
+itsdangerous==1.1.0 # via apache-airflow, flask, flask-wtf
+jaydebeapi==1.2.3 # via apache-airflow-providers-jdbc
+jinja2==2.11.3 # via apache-airflow, flask, flask-babel, python-nvd3, swagger-ui-bundle
 jmespath==0.10.0 # via boto3, botocore
-jpype1==0.7.1 # via -r requirements.in, apache-airflow, jaydebeapi
-json-merge-patch==0.2 # via apache-airflow, apache-airflow-backport-providers-google
-jsonschema==3.2.0 # via -r requirements.in, apache-airflow, flask-appbuilder
+jpype1==0.7.1 # via -r requirements.in, jaydebeapi
+json-merge-patch==0.2 # via apache-airflow-providers-google
+jsonschema==3.2.0 # via -r requirements.in, apache-airflow, flask-appbuilder, openapi-schema-validator, openapi-spec-validator
 kombu==4.6.10 # via -r requirements.in, celery
-kubernetes==11.0.0 # via apache-airflow
+kubernetes==11.0.0 # via apache-airflow-providers-cncf-kubernetes
 lazy-object-proxy==1.4.3 # via apache-airflow
 libcst==0.3.19 # via google-cloud-bigquery-storage, google-cloud-datacatalog, google-cloud-os-login, google-cloud-pubsub, google-cloud-workflows
-lockfile==0.12.2 # via python-daemon
+lockfile==0.12.2 # via apache-airflow, python-daemon
 mako==1.1.4 # via alembic
 markdown==2.6.11 # via apache-airflow
-markupsafe==2.0.1 # via jinja2, mako, wtforms
+markupsafe==1.1.1 # via apache-airflow, jinja2, mako, wtforms
 marshmallow-enum==1.5.1 # via flask-appbuilder
-marshmallow-sqlalchemy==0.23.1 # via apache-airflow, flask-appbuilder
-marshmallow==2.21.0 # via flask-appbuilder, marshmallow-enum, marshmallow-sqlalchemy
+marshmallow-oneofschema==3.0.1 # via apache-airflow
+marshmallow-sqlalchemy==0.23.1 # via flask-appbuilder
+marshmallow==3.13.0 # via flask-appbuilder, marshmallow-enum, marshmallow-oneofschema, marshmallow-sqlalchemy
 mozlogging==0.1.0 # via -r requirements.in
 mypy-extensions==0.4.3 # via typing-inspect
-mysqlclient==1.3.14 # via apache-airflow
+mysql-connector-python==8.0.22 # via apache-airflow-providers-mysql
+mysqlclient==1.3.14 # via apache-airflow-providers-mysql
 natsort==7.1.1 # via croniter
 newrelic==6.4.4.161 # via -r requirements.in
-numpy==1.21.0 # via pandas, pyarrow
+nox==2020.12.31 # via google-ads
+numpy==1.21.0 # via apache-airflow, pandas, pyarrow
 oauthlib==2.1.0 # via apache-airflow, flask-oauthlib, requests-oauthlib
-packaging==21.0 # via apache-airflow, apache-airflow-upgrade-check, google-api-core, google-cloud-appengine-logging, google-cloud-automl, google-cloud-bigquery, google-cloud-bigquery-datatransfer, google-cloud-bigquery-storage, google-cloud-datacatalog, google-cloud-dataproc, google-cloud-kms, google-cloud-logging, google-cloud-memcache, google-cloud-monitoring, google-cloud-os-login, google-cloud-redis, google-cloud-tasks, google-cloud-workflows
-pandas-gbq==0.14.1 # via -r requirements.in, apache-airflow, apache-airflow-backport-providers-google
+openapi-schema-validator==0.1.5 # via openapi-spec-validator
+openapi-spec-validator==0.3.1 # via apache-airflow
+packaging==21.0 # via google-api-core, google-cloud-appengine-logging, google-cloud-automl, google-cloud-bigquery, google-cloud-bigquery-datatransfer, google-cloud-bigquery-storage, google-cloud-datacatalog, google-cloud-dataproc, google-cloud-kms, google-cloud-logging, google-cloud-monitoring, google-cloud-os-login, google-cloud-redis, google-cloud-tasks, google-cloud-workflows
+pandas-gbq==0.14.1 # via -r requirements.in, apache-airflow-providers-google
 pandas==1.3.0 # via apache-airflow, google-cloud-bigquery, pandas-gbq
-pendulum==1.4.4 # via apache-airflow
+pendulum==2.1.2 # via apache-airflow
+platformdirs==2.2.0 # via virtualenv
 prison==0.1.3 # via flask-appbuilder
 prometheus-client==0.8.0 # via flower
-proto-plus==1.19.0 # via google-cloud-appengine-logging, google-cloud-automl, google-cloud-bigquery, google-cloud-bigquery-datatransfer, google-cloud-bigquery-storage, google-cloud-datacatalog, google-cloud-dataproc, google-cloud-kms, google-cloud-logging, google-cloud-memcache, google-cloud-monitoring, google-cloud-os-login, google-cloud-pubsub, google-cloud-redis, google-cloud-tasks, google-cloud-workflows
-protobuf==3.17.3 # via google-ads, google-api-core, google-cloud-audit-log, google-cloud-bigquery, googleapis-common-protos, proto-plus
+proto-plus==1.19.0 # via google-ads, google-cloud-appengine-logging, google-cloud-automl, google-cloud-bigquery, google-cloud-bigquery-datatransfer, google-cloud-bigquery-storage, google-cloud-datacatalog, google-cloud-dataproc, google-cloud-kms, google-cloud-logging, google-cloud-memcache, google-cloud-monitoring, google-cloud-os-login, google-cloud-pubsub, google-cloud-redis, google-cloud-tasks, google-cloud-workflows
+protobuf==3.17.3 # via google-api-core, google-cloud-audit-log, google-cloud-bigquery, googleapis-common-protos, mysql-connector-python, proto-plus
 psutil==5.8.0 # via apache-airflow
-psycopg2-binary==2.9.1 # via apache-airflow
+psycopg2-binary==2.9.1 # via apache-airflow-providers-postgres
 pure-sasl==0.6.2 # via thrift-sasl
+py==1.10.0 # via nox
 pyarrow==4.0.1 # via google-cloud-bigquery
 pyasn1-modules==0.2.8 # via google-auth
 pyasn1==0.4.8 # via pyasn1-modules, rsa
 pycparser==2.20 # via cffi
 pydata-google-auth==1.2.0 # via pandas-gbq
-pygments==2.9.0 # via apache-airflow
-pyhive[hive]==0.6.4 # via apache-airflow
-pyjwt==1.7.1 # via flask-appbuilder, flask-jwt-extended
-pyopenssl==20.0.1 # via apache-airflow, apache-airflow-backport-providers-google
+pygments==2.9.0 # via apache-airflow, rich
+pyhive[hive]==0.6.4 # via apache-airflow-providers-apache-hive
+pyjwt==1.7.1 # via apache-airflow, flask-appbuilder, flask-jwt-extended
+pyopenssl==20.0.1 # via apache-airflow-providers-google
 pyparsing==2.4.7 # via httplib2, packaging
 pyrsistent==0.18.0 # via jsonschema
 python-daemon==2.3.0 # via apache-airflow
@@ -161,43 +189,47 @@ python-dateutil==2.8.1 # via alembic, apache-airflow, botocore, croniter, fla
 python-editor==1.0.4 # via alembic
 python-nvd3==0.15.0 # via apache-airflow
 python-slugify==4.0.1 # via apache-airflow, python-nvd3
-python3-openid==3.2.0 # via flask-openid
-pytz==2021.1 # via -r requirements.in, babel, celery, flask-babel, flower, google-api-core, pandas, tzlocal
+python3-openid==3.2.0 # via apache-airflow, flask-openid
+pytz==2021.1 # via -r requirements.in, babel, celery, flask-babel, flower, google-api-core, pandas
 pytzdata==2020.1 # via pendulum
-pyyaml==5.4.1 # via apispec, flask-swagger, google-ads, kubernetes, libcst
+pyyaml==5.4.1 # via apache-airflow, apispec, clickclick, google-ads, kubernetes, libcst, openapi-spec-validator
 redis==3.5.3 # via -r requirements.in
 requests-oauthlib==1.1.0 # via apache-airflow, flask-oauthlib, google-auth-oauthlib, kubernetes
-requests==2.23.0 # via -r requirements.in, apache-airflow, datadog, google-api-core, google-cloud-bigquery, google-cloud-storage, kubernetes, requests-oauthlib
+requests==2.23.0 # via -r requirements.in, airflow-provider-fivetran, apache-airflow-providers-http, datadog, google-api-core, google-cloud-bigquery, google-cloud-storage, kubernetes, requests-oauthlib
 retrying==1.3.3 # via -r requirements.in
+rfc3986[idna2008]==1.5.0 # via httpx
+rich==10.9.0 # via apache-airflow
 rsa==4.7.2 # via google-auth
 s3transfer==0.3.7 # via boto3
 sasl==0.3.1 # via pyhive
 setproctitle==1.2.2 # via apache-airflow
 shelljob==0.5.6 # via -r requirements.in
-six==1.16.0 # via bcrypt, eventlet, flask-jwt-extended, google-api-core, google-api-python-client, google-auth, google-auth-httplib2, google-cloud-core, google-resumable-media, grpcio, jsonschema, kubernetes, prison, protobuf, pyopenssl, python-dateutil, retrying, sasl, sqlalchemy-utils, tenacity, thrift, thrift-sasl
-sqlalchemy-jsonfield==0.9.0 # via apache-airflow
+six==1.16.0 # via bcrypt, eventlet, flask-jwt-extended, google-api-core, google-api-python-client, google-auth, google-auth-httplib2, google-cloud-core, google-resumable-media, grpcio, isodate, jsonschema, kubernetes, openapi-schema-validator, openapi-spec-validator, prison, protobuf, pyopenssl, python-dateutil, retrying, sasl, sqlalchemy-utils, tenacity, thrift, thrift-sasl, virtualenv
+sniffio==1.2.0 # via anyio, httpcore, httpx
+sqlalchemy-jsonfield==1.0.0 # via apache-airflow
 sqlalchemy-utils==0.37.8 # via flask-appbuilder
-sqlalchemy==1.3.15 # via -r requirements.in, alembic, apache-airflow, flask-sqlalchemy, marshmallow-sqlalchemy, sqlalchemy-jsonfield, sqlalchemy-utils
+sqlalchemy==1.3.24 # via -r requirements.in, alembic, apache-airflow, flask-appbuilder, flask-sqlalchemy, marshmallow-sqlalchemy, sqlalchemy-jsonfield, sqlalchemy-utils
 statsd==3.3.0 # via apache-airflow
+swagger-ui-bundle==0.0.8 # via apache-airflow
 tabulate==0.8.9 # via apache-airflow
-tenacity==4.12.0 # via apache-airflow
+tenacity==6.2.0 # via apache-airflow
+termcolor==1.1.0 # via apache-airflow
 text-unidecode==1.3 # via python-slugify
 thrift-sasl==0.4.3 # via pyhive
-thrift==0.13.0 # via apache-airflow, hmsclient, pyhive, thrift-sasl
-tornado==5.1.1 # via apache-airflow, flower
-typing-extensions==3.10.0.0 # via apache-airflow, libcst, typing-inspect
+thrift==0.13.0 # via apache-airflow-providers-apache-hive, hmsclient, pyhive, thrift-sasl
+tornado==5.1.1 # via flower
+typing-extensions==3.10.0.0 # via anyio, apache-airflow, libcst, rich, typing-inspect
 typing-inspect==0.7.1 # via libcst
-tzlocal==1.5.1 # via apache-airflow, pendulum
 unicodecsv==0.14.1 # via apache-airflow
 uritemplate==3.0.1 # via google-api-python-client
 urllib3==1.25.11 # via -r requirements.in, botocore, kubernetes, requests
-vine==1.3.0 # via amqp, apache-airflow, celery, flower
-watchtower==0.7.3 # via apache-airflow-backport-providers-amazon
+vine==1.3.0 # via amqp, apache-airflow-providers-celery, celery, flower
+virtualenv==20.7.2 # via nox
+watchtower==1.0.6 # via apache-airflow-providers-amazon
 websocket-client==1.1.0 # via -r requirements.in, kubernetes
-werkzeug==0.16.0 # via -r requirements.in, apache-airflow, flask, flask-caching, flask-jwt-extended
+werkzeug==1.0.1 # via -r requirements.in, apache-airflow, flask, flask-jwt-extended
 wtforms==2.3.3 # via flask-admin, flask-wtf
 zipp==3.5.0 # via importlib-metadata, importlib-resources
-zope.deprecation==4.4.0 # via apache-airflow
 zope.event==4.5.0 # via gevent
 zope.interface==5.4.0 # via gevent
diff --git a/webserver_config.py b/webserver_config.py
index 824d55103..118db6034 100644
--- a/webserver_config.py
+++ b/webserver_config.py
@@ -70,14 +70,14 @@
     'token_key':'access_token',
     'icon':'fa-google',
     'remote_app': {
-        'base_url':'https://www.googleapis.com/oauth2/v2/',
-        'request_token_params':{
+        'api_base_url':'https://www.googleapis.com/oauth2/v2/',
+        'client_kwargs':{
             'scope': 'email profile'
         },
         'access_token_url':'https://accounts.google.com/o/oauth2/token',
         'authorize_url':'https://accounts.google.com/o/oauth2/auth',
         'request_token_url': None,
-        'consumer_key': GOOGLE_KEY,
-        'consumer_secret': GOOGLE_SECRET,
+        'client_id': GOOGLE_KEY,
+        'client_secret': GOOGLE_SECRET,
     }
 }]
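For context: the webserver_config.py hunk above exists because Flask-AppBuilder 3.x (pinned to 3.3.2 in this change, alongside authlib==0.15.4) handles OAuth via authlib rather than flask-oauthlib, which renames the provider keys (base_url -> api_base_url, request_token_params -> client_kwargs, consumer_key/consumer_secret -> client_id/client_secret). A minimal sketch of the resulting Google entry, assuming the surrounding OAUTH_PROVIDERS list and with the 'name' value and the GOOGLE_KEY / GOOGLE_SECRET assignments as placeholders for what is defined elsewhere in webserver_config.py:

# Sketch only -- not part of the diff; placeholder values are marked below.
GOOGLE_KEY = "<client-id>"          # placeholder; defined earlier in the real webserver_config.py
GOOGLE_SECRET = "<client-secret>"   # placeholder; defined earlier in the real webserver_config.py

OAUTH_PROVIDERS = [{
    'name': 'google',               # assumed; the provider name is not shown in this hunk
    'token_key': 'access_token',
    'icon': 'fa-google',
    'remote_app': {
        'api_base_url': 'https://www.googleapis.com/oauth2/v2/',   # was 'base_url'
        'client_kwargs': {'scope': 'email profile'},               # was 'request_token_params'
        'access_token_url': 'https://accounts.google.com/o/oauth2/token',
        'authorize_url': 'https://accounts.google.com/o/oauth2/auth',
        'request_token_url': None,
        'client_id': GOOGLE_KEY,        # was 'consumer_key'
        'client_secret': GOOGLE_SECRET, # was 'consumer_secret'
    },
}]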