Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Replace Sentry metrics with Prometheus metrics #918

Merged
merged 34 commits into from
Oct 30, 2024
Merged
Show file tree
Hide file tree
Changes from 32 commits
Commits
Show all changes
34 commits
Select commit Hold shift + click to select a range
c8685db
Update `shared` and remove statsd metrics usage
Swatinem Oct 22, 2024
52dd85c
remove usage of mock_metrics
Swatinem Oct 22, 2024
2909aa6
feat: add Prometheus metrics for uploads
tony-codecov Oct 22, 2024
91a8578
Replace GQL Sentry Metrics with Prometheus metrics
tony-codecov Oct 22, 2024
ac58dbf
validate: Replace Sentry metrics with Prometheus metrics
tony-codecov Oct 22, 2024
6d72497
upload: Modify generate metrics labels helper function
tony-codecov Oct 22, 2024
5af7369
commits: replace Sentry metrics with Prometheus metrics
tony-codecov Oct 22, 2024
af3f0f0
Add fill_labels=False to avoid redundant labels
tony-codecov Oct 22, 2024
4c79e54
empty_upload: Replace Sentry metrics with Prometheus metrics
tony-codecov Oct 22, 2024
cfbe611
legacy upload: replace Sentry metrics with Prometheus metrics
tony-codecov Oct 22, 2024
9fe6d4b
create_upload: replace Sentry metrics with Prometheus metrics
tony-codecov Oct 22, 2024
66f02a9
upload_complete: replace Sentry metrics with Prometheus metrics
tony-codecov Oct 22, 2024
3f86b59
test_results: replace Sentry metrics with Prometheus metrics
tony-codecov Oct 22, 2024
e9ef0b4
reports: replace Sentry metrics with Prometheus metrics
tony-codecov Oct 22, 2024
9ff1ba8
Fix incr() and inc() typos
tony-codecov Oct 22, 2024
79d6d52
Merge branch 'main' into tony/prometheus-metrics
tony-codecov Oct 22, 2024
2dfa87b
fix: Lints
tony-codecov Oct 22, 2024
2e71531
More syntax and lint fixes
tony-codecov Oct 22, 2024
eaf9a41
helpers: prometheus metrics labels function linting fix
tony-codecov Oct 22, 2024
4d2c20e
reports: Make mock metrics name consistent
tony-codecov Oct 22, 2024
61865f0
Use inc_counter in shared.metrics
tony-codecov Oct 23, 2024
27f74ad
bundle_analysis: remove try block
tony-codecov Oct 23, 2024
00bd746
Merge branch 'main' into tony/prometheus-metrics
tony-codecov Oct 23, 2024
13086fd
Function name clarity ("labels" instead of "tags")
tony-codecov Oct 23, 2024
7077928
fix: Lints
tony-codecov Oct 23, 2024
08b9256
Bump shared version
tony-codecov Oct 23, 2024
44b2c12
Merge remote-tracking branch 'origin/swatinem/update-shared' into ton…
tony-codecov Oct 23, 2024
3ad8aff
Remove self.test_instances[0].test.failure_rate
tony-codecov Oct 23, 2024
bb7757a
Remove commits_where_fail
tony-codecov Oct 23, 2024
e222298
Merge branch 'main' into tony/prometheus-metrics
tony-codecov Oct 24, 2024
e052728
Merge branch 'main' into tony/prometheus-metrics
tony-codecov Oct 25, 2024
2386132
Merge branch 'main' into tony/prometheus-metrics
tony-codecov Oct 28, 2024
919ee71
Use include_empty_labels for extra clarity
tony-codecov Oct 28, 2024
c691c6e
Merge branch 'main' into tony/prometheus-metrics
tony-codecov Oct 28, 2024
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
16 changes: 8 additions & 8 deletions graphql_api/tests/test_views.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
import json
from unittest.mock import Mock, call, patch
from unittest.mock import Mock, patch

from ariadne import ObjectType, make_executable_schema
from ariadne.validation import cost_directive
Expand Down Expand Up @@ -201,11 +201,12 @@ async def test_query_metrics_extension_set_type_and_name_timeout(
assert extension.operation_type == "unknown_type"
assert extension.operation_name == "unknown_name"

@patch("sentry_sdk.metrics.incr")
@patch("graphql_api.views.GQL_REQUEST_MADE_COUNTER.labels")
@patch("graphql_api.views.GQL_ERROR_TYPE_COUNTER.labels")
@patch("graphql_api.views.AsyncGraphqlView._check_ratelimit")
@override_settings(DEBUG=False, GRAPHQL_RATE_LIMIT_RPM=1000)
async def test_when_rate_limit_reached(
self, mocked_check_ratelimit, mocked_sentry_incr
self, mocked_check_ratelimit, mocked_error_counter, mocked_request_counter
):
schema = generate_cost_test_schema()
mocked_check_ratelimit.return_value = True
Expand All @@ -217,11 +218,10 @@ async def test_when_rate_limit_reached(
== "It looks like you've hit the rate limit of 1000 req/min. Try again later."
)

expected_calls = [
call("graphql.info.request_made", tags={"path": "/graphql/gh"}),
call("graphql.error.rate_limit", tags={"path": "/graphql/gh"}),
]
mocked_sentry_incr.assert_has_calls(expected_calls)
mocked_error_counter.assert_called_with(
error_type="rate_limit", path="/graphql/gh"
)
mocked_request_counter.assert_called_with(path="/graphql/gh")

@override_settings(
DEBUG=False, GRAPHQL_RATE_LIMIT_RPM=0, GRAPHQL_RATE_LIMIT_ENABLED=False
Expand Down
46 changes: 34 additions & 12 deletions graphql_api/views.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,8 +19,7 @@
)
from graphql import DocumentNode
from sentry_sdk import capture_exception
from sentry_sdk import metrics as sentry_metrics
from shared.metrics import Counter, Histogram
from shared.metrics import Counter, Histogram, inc_counter

from codecov.commands.exceptions import BaseException
from codecov.commands.executor import get_executor_from_request
Expand Down Expand Up @@ -51,6 +50,17 @@
buckets=[0.05, 0.1, 0.25, 0.5, 0.75, 1, 2, 5, 10, 30],
)

GQL_REQUEST_MADE_COUNTER = Counter(
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Couldn't quite tell - what's the intended difference between this one (GQL_REQUEST_MADE_COUNTER) and GQL_HIT_COUNTER in above (line 34)? Any opportunities to combine?

Also a nit but it seems like we use "Total" in the description when it's a Histogram not Counter

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I added GQL_REQUEST_MADE_COUNTER so that the post function in AsyncGraphqlView class can increment this metric with the label path=req_path, and the other metric GQL_HIT_COUNTER is called when the actual gql API is called through the request_started extension: https://ariadnegraphql.org/docs/extensions. If we want to combine this, then we would have to give req_path to the QueryMetricsExtension in some way, I was thinking of setting a self.path attribute to the AsyncGraphqlView class but since it's handling many async requests I don't think that would work. Do you know of a way to implement this?

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I believe path is available in the context argument like info.context["request"].url.path, if that's what you mean?

If this will be a duplicate of the information in GQL_HIT_COUNTER then it seems like removing/combining makes sense. But since it sounds like it's meant to be another step in the lifecycle of a request, we can use it for debugging, for example comparing numbers against the GQL_REQUEST_MADE_COUNTER. Actually I like the location of the GQL_REQUEST_MADE_COUNTER better than the other one that already existed, so we can just leave it! I saw you were just converting the sentry one to prometheus anyway so we can just proceed.

"api_gql_requests_made",
"Total API GQL requests made",
["path"],
)

GQL_ERROR_TYPE_COUNTER = Counter(
"api_gql_errors",
"Number of times API GQL endpoint failed with an exception by type",
["error_type", "path"],
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Could we include a comment on when to use existing GQL_ERROR_COUNTER vs. new GQL_ERROR_TYPE_COUNTER for a particular use case? Or is this something we could potentially combine?

)

# covers named and 3 unnamed operations (see graphql_api/types/query/query.py)
GQL_TYPE_AND_NAME_PATTERN = r"^(query|mutation|subscription)(?:\(\$input:|) (\w+)(?:\(| \(|{| {|!)|^(?:{) (me|owner|config)(?:\(| |{)"
Expand Down Expand Up @@ -109,9 +119,13 @@ def request_started(self, context):
"""
self.set_type_and_name(query=context["clean_query"])
self.start_timestamp = time.perf_counter()
GQL_HIT_COUNTER.labels(
operation_type=self.operation_type, operation_name=self.operation_name
).inc()
inc_counter(
GQL_HIT_COUNTER,
labels=dict(
operation_type=self.operation_type,
operation_name=self.operation_name,
),
)

def request_finished(self, context):
"""
Expand Down Expand Up @@ -226,10 +240,12 @@ async def post(self, request, *args, **kwargs):
"user": request.user,
}
log.info("GraphQL Request", extra=log_data)
sentry_metrics.incr("graphql.info.request_made", tags={"path": req_path})

inc_counter(GQL_REQUEST_MADE_COUNTER, labels=dict(path=req_path))
if self._check_ratelimit(request=request):
sentry_metrics.incr("graphql.error.rate_limit", tags={"path": req_path})
inc_counter(
GQL_ERROR_TYPE_COUNTER,
labels=dict(error_type="rate_limit", path=req_path),
)
return JsonResponse(
data={
"status": 429,
Expand All @@ -245,7 +261,10 @@ async def post(self, request, *args, **kwargs):
data = json.loads(content)

if "errors" in data:
sentry_metrics.incr("graphql.error.all", tags={"path": req_path})
inc_counter(
GQL_ERROR_TYPE_COUNTER,
labels=dict(error_type="all", path=req_path),
)
try:
if data["errors"][0]["extensions"]["cost"]:
costs = data["errors"][0]["extensions"]["cost"]
Expand All @@ -257,9 +276,12 @@ async def post(self, request, *args, **kwargs):
request_body=req_body,
),
)
sentry_metrics.incr(
"graphql.error.query_cost_exceeded",
tags={"path": req_path},
inc_counter(
GQL_ERROR_TYPE_COUNTER,
labels=dict(
error_type="query_cost_exceeded",
path=req_path,
),
)
return HttpResponseBadRequest(
JsonResponse("Your query is too costly.")
Expand Down
27 changes: 19 additions & 8 deletions upload/helpers.py
Original file line number Diff line number Diff line change
Expand Up @@ -790,14 +790,15 @@ def get_version_from_headers(headers):
return "unknown-user-agent"


def generate_upload_sentry_metrics_tags(
def generate_upload_prometheus_metrics_labels(
action,
request,
is_shelter_request,
endpoint: Optional[str] = None,
repository: Optional[Repository] = None,
position: Optional[str] = None,
upload_version: Optional[str] = None,
fill_labels: bool = True,
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

nit: A name like fill_optional_labels or include_empty_labels may be more descriptive to someone who wants to decide what value to pass here.
Also we may include a comment by the optional_fields dict on what was discussed in the convo below so someone doesn't inadvertently remove one of these optional fields and it breaks a metric somewhere else due to the missing label

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I agree, this would be much clearer. I've opted with using include_empty_labels.

):
metrics_tags = dict(
agent=get_agent_from_headers(request.headers),
Expand All @@ -806,13 +807,23 @@ def generate_upload_sentry_metrics_tags(
endpoint=endpoint,
is_using_shelter="yes" if is_shelter_request else "no",
)

repo_visibility = None
if repository:
metrics_tags["repo_visibility"] = (
"private" if repository.private is True else "public"
)
if position:
metrics_tags["position"] = position
if upload_version:
metrics_tags["upload_version"] = upload_version
repo_visibility = "private" if repository.private else "public"

optional_fields = {
"repo_visibility": repo_visibility,
"position": position,
"upload_version": upload_version,
}
Comment on lines +815 to +819
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This code doesn't look like it's doing anything different from the original code, and the original code is a bit simpler. What is the reason to create an optional_fields dict here?

Copy link
Contributor Author

@tony-codecov tony-codecov Oct 23, 2024

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

In the original code, the optional_fields are not created if we don't provide them in the arguments, and python's prometheus-client errors if we initialize a metric with N labels, but call them with some empty labels. This change allows it to create these fields that are filled with None such that they are present in the labels dict.

The original code does not create the fields if the corresponding arguments are not provided. However, the prometheus-client in Python raises an error if we initialize a metric with a certain number of labels but then call it with some empty labels. I introduced optional_fields dict here to ensure that these fields are always present in the labels dictionary, even if they are filled with None.

i.e. old code, generate_tags(..., endpoint="some_endpoint") returns:

{
   ...
   endpoint: "some_endpoint",
}

in the new code, it will return

{
   ...
   endpoint: "some_endpoint",
   repo_visibility: "None",
   position: "None",
   upload_version: "None",
}

If we still want the first dict to be returned, we can call generate tags with fill_labels=False.

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Ah, I see. I think that the new code will actually return

{
   ...
   endpoint: "some_endpoint",
   repo_visibility: None,
   position: None,
   upload_version: None,
}

where the values for the three extra fields are of None type (not string). Is this still fine?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I checked and that's fine, in prometheus it will interpret it as a string "None".


metrics_tags.update(
{
field: value
for field, value in optional_fields.items()
if value or fill_labels
}
)

return metrics_tags
16 changes: 16 additions & 0 deletions upload/metrics.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,16 @@
from shared.metrics import Counter

API_UPLOAD_COUNTER = Counter(
"api_upload",
"Total API upload endpoint requests",
[
"agent",
"version",
"action",
"endpoint",
"is_using_shelter",
"repo_visibility",
"position",
"upload_version",
],
)
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Nice, I like this pattern of pulling it out into a separate file!
I see the other spots in this PR are following the existing pattern of keeping it in the views.py file because it would be a bigger lift to move those around, but this does seem like a nicer system going forward where possible!

8 changes: 4 additions & 4 deletions upload/tests/views/test_commits.py
Original file line number Diff line number Diff line change
Expand Up @@ -328,7 +328,7 @@ def test_commit_github_oidc_auth(mock_jwks_client, mock_jwt_decode, db, mocker):
private=False, author__username="codecov", name="the_repo"
)
mocked_call = mocker.patch.object(TaskService, "update_commit")
mock_sentry_metrics = mocker.patch("upload.views.commits.sentry_metrics.incr")
mock_prometheus_metrics = mocker.patch("upload.metrics.API_UPLOAD_COUNTER.labels")
mock_jwt_decode.return_value = {
"repository": f"url/{repository.name}",
"repository_owner": repository.author.username,
Expand Down Expand Up @@ -374,15 +374,15 @@ def test_commit_github_oidc_auth(mock_jwks_client, mock_jwt_decode, db, mocker):
}
assert expected_response == response_json
mocked_call.assert_called_with(commitid="commit_sha", repoid=repository.repoid)
mock_sentry_metrics.assert_called_with(
"upload",
tags={
mock_prometheus_metrics.assert_called_with(
**{
"agent": "cli",
"version": "0.4.7",
"action": "coverage",
"endpoint": "create_commit",
"repo_visibility": "public",
"is_using_shelter": "no",
"position": "end",
"upload_version": None,
},
)
8 changes: 4 additions & 4 deletions upload/tests/views/test_empty_upload.py
Original file line number Diff line number Diff line change
Expand Up @@ -51,7 +51,7 @@ def test_empty_upload_with_yaml_ignored_files(
mocker.patch.object(
CanDoCoverageUploadsPermission, "has_permission", return_value=True
)
mock_sentry_metrics = mocker.patch("upload.views.empty_upload.sentry_metrics.incr")
mock_prometheus_metrics = mocker.patch("upload.metrics.API_UPLOAD_COUNTER.labels")
mock_final_yaml.return_value = UserYaml(
{
"ignore": [
Expand Down Expand Up @@ -93,16 +93,16 @@ def test_empty_upload_with_yaml_ignored_files(
notify_mock.assert_called_once_with(
repoid=repository.repoid, commitid=commit.commitid, empty_upload="pass"
)
mock_sentry_metrics.assert_called_with(
"upload",
tags={
mock_prometheus_metrics.assert_called_with(
**{
"agent": "cli",
"version": "0.4.7",
"action": "coverage",
"endpoint": "empty_upload",
"repo_visibility": "private",
"is_using_shelter": "no",
"position": "end",
"upload_version": None,
},
)

Expand Down
16 changes: 8 additions & 8 deletions upload/tests/views/test_reports.py
Original file line number Diff line number Diff line change
Expand Up @@ -41,7 +41,7 @@ def test_reports_get_not_allowed(client, mocker, db):

def test_reports_post(client, db, mocker):
mocked_call = mocker.patch.object(TaskService, "preprocess_upload")
mock_sentry_metrics = mocker.patch("upload.views.reports.sentry_metrics.incr")
mock_prometheus_metrics = mocker.patch("upload.metrics.API_UPLOAD_COUNTER.labels")
repository = RepositoryFactory(
name="the_repo", author__username="codecov", author__service="github"
)
Expand All @@ -65,16 +65,16 @@ def test_reports_post(client, db, mocker):
commit_id=commit.id, code="code1", report_type=CommitReport.ReportType.COVERAGE
).exists()
mocked_call.assert_called_with(repository.repoid, commit.commitid, "code1")
mock_sentry_metrics.assert_called_with(
"upload",
tags={
mock_prometheus_metrics.assert_called_with(
**{
"agent": "cli",
"version": "0.4.7",
"action": "coverage",
"endpoint": "create_report",
"repo_visibility": "private",
"is_using_shelter": "no",
"position": "end",
"upload_version": None,
},
)

Expand Down Expand Up @@ -343,7 +343,7 @@ def test_reports_results_post_successful_github_oidc_auth(
mock_jwks_client, mock_jwt_decode, client, db, mocker
):
mocked_task = mocker.patch("services.task.TaskService.create_report_results")
mock_sentry_metrics = mocker.patch("upload.views.reports.sentry_metrics.incr")
mock_prometheus_metrics = mocker.patch("upload.metrics.API_UPLOAD_COUNTER.labels")
mocker.patch.object(
CanDoCoverageUploadsPermission, "has_permission", return_value=True
)
Expand Down Expand Up @@ -383,16 +383,16 @@ def test_reports_results_post_successful_github_oidc_auth(
report_id=commit_report.id,
).exists()
mocked_task.assert_called_once()
mock_sentry_metrics.assert_called_with(
"upload",
tags={
mock_prometheus_metrics.assert_called_with(
**{
"agent": "cli",
"version": "0.4.7",
"action": "coverage",
"endpoint": "create_report_results",
"repo_visibility": "private",
"is_using_shelter": "no",
"position": "end",
"upload_version": None,
},
)

Expand Down
8 changes: 4 additions & 4 deletions upload/tests/views/test_test_results.py
Original file line number Diff line number Diff line change
Expand Up @@ -20,7 +20,7 @@

def test_upload_test_results(db, client, mocker, mock_redis):
upload = mocker.patch.object(TaskService, "upload")
mock_sentry_metrics = mocker.patch("upload.views.test_results.metrics.incr")
mock_prometheus_metrics = mocker.patch("upload.metrics.API_UPLOAD_COUNTER.labels")
create_presigned_put = mocker.patch(
"services.archive.StorageService.create_presigned_put",
return_value="test-presigned-put",
Expand Down Expand Up @@ -100,16 +100,16 @@ def test_upload_test_results(db, client, mocker, mock_redis):
report_code=None,
report_type="test_results",
)
mock_sentry_metrics.assert_called_with(
"upload",
tags={
mock_prometheus_metrics.assert_called_with(
**{
"agent": "cli",
"version": "0.4.7",
"action": "test_results",
"endpoint": "test_results",
"repo_visibility": "private",
"is_using_shelter": "no",
"position": "end",
"upload_version": None,
},
)

Expand Down
10 changes: 4 additions & 6 deletions upload/tests/views/test_upload_completion.py
Original file line number Diff line number Diff line change
Expand Up @@ -51,9 +51,7 @@ def test_upload_completion_view_processed_uploads(mocked_manual_trigger, db, moc
mocker.patch.object(
CanDoCoverageUploadsPermission, "has_permission", return_value=True
)
mock_sentry_metrics = mocker.patch(
"upload.views.upload_completion.sentry_metrics.incr"
)
mock_prometheus_metrics = mocker.patch("upload.metrics.API_UPLOAD_COUNTER.labels")
repository = RepositoryFactory(
name="the_repo", author__username="codecov", author__service="github"
)
Expand Down Expand Up @@ -88,16 +86,16 @@ def test_upload_completion_view_processed_uploads(mocked_manual_trigger, db, moc
"uploads_error": 0,
}
mocked_manual_trigger.assert_called_once_with(repository.repoid, commit.commitid)
mock_sentry_metrics.assert_called_with(
"upload",
tags={
mock_prometheus_metrics.assert_called_with(
**{
"agent": "cli",
"version": "0.4.7",
"action": "coverage",
"endpoint": "upload_complete",
"repo_visibility": "private",
"is_using_shelter": "no",
"position": "end",
"upload_version": None,
},
)

Expand Down
Loading
Loading