Skip to content

Commit

Permalink
(feat): Add opensearch logging integration (apache#41799)
Browse files Browse the repository at this point in the history
* Add feature to read logs from OpenSearch
  • Loading branch information
Owen-CH-Leung authored Sep 21, 2024
1 parent 7628d47 commit ba1c602
Show file tree
Hide file tree
Showing 17 changed files with 1,956 additions and 1 deletion.
31 changes: 30 additions & 1 deletion airflow/config_templates/airflow_local_settings.py
Original file line number Diff line number Diff line change
Expand Up @@ -208,7 +208,7 @@

if REMOTE_LOGGING:
ELASTICSEARCH_HOST: str | None = conf.get("elasticsearch", "HOST")

OPENSEARCH_HOST: str | None = conf.get("opensearch", "HOST")
# Storage bucket URL for remote logging
# S3 buckets should start with "s3://"
# Cloudwatch log groups should start with "cloudwatch://"
Expand Down Expand Up @@ -330,6 +330,35 @@
}

DEFAULT_LOGGING_CONFIG["handlers"].update(ELASTIC_REMOTE_HANDLERS)
elif OPENSEARCH_HOST:
OPENSEARCH_END_OF_LOG_MARK: str = conf.get_mandatory_value("opensearch", "END_OF_LOG_MARK")
OPENSEARCH_PORT: str = conf.get_mandatory_value("opensearch", "PORT")
OPENSEARCH_USERNAME: str = conf.get_mandatory_value("opensearch", "USERNAME")
OPENSEARCH_PASSWORD: str = conf.get_mandatory_value("opensearch", "PASSWORD")
OPENSEARCH_WRITE_STDOUT: bool = conf.getboolean("opensearch", "WRITE_STDOUT")
OPENSEARCH_JSON_FORMAT: bool = conf.getboolean("opensearch", "JSON_FORMAT")
OPENSEARCH_JSON_FIELDS: str = conf.get_mandatory_value("opensearch", "JSON_FIELDS")
OPENSEARCH_HOST_FIELD: str = conf.get_mandatory_value("opensearch", "HOST_FIELD")
OPENSEARCH_OFFSET_FIELD: str = conf.get_mandatory_value("opensearch", "OFFSET_FIELD")

OPENSEARCH_REMOTE_HANDLERS: dict[str, dict[str, str | bool | None]] = {
"task": {
"class": "airflow.providers.opensearch.log.os_task_handler.OpensearchTaskHandler",
"formatter": "airflow",
"base_log_folder": str(os.path.expanduser(BASE_LOG_FOLDER)),
"end_of_log_mark": OPENSEARCH_END_OF_LOG_MARK,
"host": OPENSEARCH_HOST,
"port": OPENSEARCH_PORT,
"username": OPENSEARCH_USERNAME,
"password": OPENSEARCH_PASSWORD,
"write_stdout": OPENSEARCH_WRITE_STDOUT,
"json_format": OPENSEARCH_JSON_FORMAT,
"json_fields": OPENSEARCH_JSON_FIELDS,
"host_field": OPENSEARCH_HOST_FIELD,
"offset_field": OPENSEARCH_OFFSET_FIELD,
},
}
DEFAULT_LOGGING_CONFIG["handlers"].update(OPENSEARCH_REMOTE_HANDLERS)
else:
raise AirflowException(
"Incorrect remote log configuration. Please check the configuration of option 'host' in "
Expand Down
24 changes: 24 additions & 0 deletions airflow/config_templates/unit_tests.cfg
Original file line number Diff line number Diff line change
Expand Up @@ -97,6 +97,30 @@ dag_dir_list_interval = 0
# by users.
job_heartbeat_sec = 2

[opensearch]
host =
port =
username =
password =
log_id_template =
end_of_log_mark = end_of_log
write_stdout = False
json_format = False
json_fields = asctime, filename, lineno, levelname, message
host_field = host
offset_field = offset
index_patterns = _all
index_patterns_callable =

[opensearch_configs]
http_compress = False
use_ssl = False
verify_certs = False
ssl_assert_hostname = False
ssl_show_warn = False
ca_certs =


[example_section]
# This section is used to test coercions of configuration values retrieval
string_value = 21600
Expand Down
16 changes: 16 additions & 0 deletions airflow/providers/opensearch/log/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,16 @@
# Licensed to the Apache Software Foundation (ASF) under one
# or more contributor license agreements. See the NOTICE file
# distributed with this work for additional information
# regarding copyright ownership. The ASF licenses this file
# to you under the Apache License, Version 2.0 (the
# "License"); you may not use this file except in compliance
# with the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing,
# software distributed under the License is distributed on an
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, either express or implied. See the License for the
# specific language governing permissions and limitations
# under the License.
43 changes: 43 additions & 0 deletions airflow/providers/opensearch/log/os_json_formatter.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,43 @@
# Licensed to the Apache Software Foundation (ASF) under one
# or more contributor license agreements. See the NOTICE file
# distributed with this work for additional information
# regarding copyright ownership. The ASF licenses this file
# to you under the Apache License, Version 2.0 (the
# "License"); you may not use this file except in compliance
# with the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing,
# software distributed under the License is distributed on an
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, either express or implied. See the License for the
# specific language governing permissions and limitations
# under the License.
from __future__ import annotations

from datetime import datetime

import pendulum

from airflow.utils.log.json_formatter import JSONFormatter


class OpensearchJSONFormatter(JSONFormatter):
    """JSON log formatter that renders record timestamps in ISO 8601 format."""

    # Overrides of logging.Formatter's class-level format strings.
    default_time_format = "%Y-%m-%dT%H:%M:%S"
    default_msec_format = "%s.%03d"
    default_tz_format = "%z"

    def formatTime(self, record, datefmt=None):
        """Return the LogRecord's creation time as ISO 8601 in the local time zone."""
        # TODO: Use airflow.utils.timezone.from_timestamp(record.created, tz="local")
        # as soon as min Airflow 2.9.0
        created = datetime.fromtimestamp(record.created, tz=pendulum.local_timezone())
        formatted = created.strftime(datefmt if datefmt else self.default_time_format)
        if self.default_msec_format:
            formatted = self.default_msec_format % (formatted, record.msecs)
        if self.default_tz_format:
            formatted += created.strftime(self.default_tz_format)
        return formatted
168 changes: 168 additions & 0 deletions airflow/providers/opensearch/log/os_response.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,168 @@
# Licensed to the Apache Software Foundation (ASF) under one
# or more contributor license agreements. See the NOTICE file
# distributed with this work for additional information
# regarding copyright ownership. The ASF licenses this file
# to you under the Apache License, Version 2.0 (the
# "License"); you may not use this file except in compliance
# with the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing,
# software distributed under the License is distributed on an
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, either express or implied. See the License for the
# specific language governing permissions and limitations
# under the License.
from __future__ import annotations

from typing import Iterator


def _wrap(val):
if isinstance(val, dict):
return AttributeDict(val)
return val


class AttributeList:
    """Helper class to provide attribute like access to List objects."""

    def __init__(self, _list):
        # Accept any iterable; materialize it so indexing/slicing work.
        if not isinstance(_list, list):
            _list = list(_list)
        self._l_ = _list

    def __getitem__(self, k):
        """
        Retrieve an item or a slice from the list.

        A slice yields a new AttributeList; a single item that is a
        dictionary is wrapped in an AttributeDict.
        """
        val = self._l_[k]
        # Test the *key* for slice-ness: ``val`` is the already-sliced result
        # and can never itself be a slice object, so testing ``val`` (as the
        # previous code did) silently returned a plain list for slices.
        if isinstance(k, slice):
            return AttributeList(val)
        return _wrap(val)

    def __iter__(self):
        """Iterate over the list, wrapping each dict item in an AttributeDict."""
        return (_wrap(i) for i in self._l_)

    def __bool__(self):
        """Check if the list is non-empty."""
        return bool(self._l_)


class AttributeDict:
"""Helper class to provide attribute like access to Dictionary objects."""

def __init__(self, d):
super().__setattr__("_d_", d)

def __getattr__(self, attr_name):
"""Retrieve an item as an attribute from the dictionary."""
try:
return self.__getitem__(attr_name)
except KeyError:
raise AttributeError(f"{self.__class__.__name__!r} object has no attribute {attr_name!r}")

def __getitem__(self, key):
"""Retrieve an item using a key from the dictionary."""
return _wrap(self._d_[key])

def to_dict(self):
return self._d_


class Hit(AttributeDict):
    """
    A single document returned from a search.

    Merges the document's ``_source`` and ``fields`` payloads and exposes
    them with attribute-like access (inherited from AttributeDict); the
    remaining document metadata is available through the ``meta`` attribute.
    """

    def __init__(self, document):
        payload = {}
        if "_source" in document:
            payload = document["_source"]
        if "fields" in document:
            payload.update(document["fields"])

        super().__init__(payload)
        # Bypass attribute machinery so ``meta`` is a real instance attribute.
        super().__setattr__("meta", HitMeta(document))


class HitMeta(AttributeDict):
    """
    Attribute-like view over a document's metadata.

    Keys listed in *exclude* are dropped, the leading underscore is stripped
    from the remaining keys (e.g. ``_id`` -> ``id``), and ``type`` is renamed
    to ``doc_type``.
    """

    def __init__(self, document, exclude=("_source", "_fields")):
        meta = {}
        for key, value in document.items():
            if key in exclude:
                continue
            meta[key[1:] if key.startswith("_") else key] = value
        # Keep the naming consistent everywhere in the Python client.
        if "type" in meta:
            meta["doc_type"] = meta.pop("type")
        super().__init__(meta)


class OpensearchResponse(AttributeDict):
    """
    Attribute-like wrapper around the raw dictionary returned by an
    Opensearch search call.

    Iterating, indexing with an int or slice, and truth-testing all delegate
    to the parsed ``hits``; any other key is looked up in the raw response
    dictionary. Each hit is transformed via the owning search instance's
    ``_get_result`` method (with ``doc_class``, if provided, available to it).
    """

    def __init__(self, search, response, doc_class=None):
        # Store bookkeeping attributes directly so they don't shadow
        # keys of the wrapped response dictionary.
        super().__setattr__("_search", search)
        super().__setattr__("_doc_class", doc_class)
        super().__init__(response)

    def __iter__(self) -> Iterator[Hit]:
        """Iterate over the parsed hits of the response."""
        return iter(self.hits)

    def __getitem__(self, key):
        """Index into the hits for int/slice keys, otherwise into the raw response."""
        if isinstance(key, (int, slice)):
            return self.hits[key]
        return super().__getitem__(key)

    def __bool__(self):
        """Return True when the response contains at least one hit."""
        return bool(self.hits)

    @property
    def hits(self) -> list[Hit]:
        """
        The hits (i.e. results) of the response, built lazily on first access.

        Each raw hit is converted through the associated search instance's
        ``_get_result`` method and collected into an ``AttributeList``. The
        other entries of the response's ``hits`` section (e.g. ``total``,
        ``max_score``) are attached to that list as attributes. The parsed
        list is cached on the instance for subsequent accesses.
        """
        if not hasattr(self, "_hits"):
            raw = self._d_["hits"]

            try:
                parsed = AttributeList(map(self._search._get_result, raw["hits"]))
            except AttributeError as e:
                raise TypeError("Could not parse hits.", e)

            super().__setattr__("_hits", parsed)
            for key in raw:
                setattr(self._hits, key, _wrap(raw[key]))
        return self._hits
Loading

0 comments on commit ba1c602

Please sign in to comment.