-
Notifications
You must be signed in to change notification settings - Fork 28
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Replace Sentry metrics with Prometheus metrics #918
Changes from 32 commits
c8685db
52dd85c
2909aa6
91a8578
ac58dbf
6d72497
5af7369
af3f0f0
4c79e54
cfbe611
9fe6d4b
66f02a9
3f86b59
e9ef0b4
9ff1ba8
79d6d52
2dfa87b
2e71531
eaf9a41
4d2c20e
61865f0
27f74ad
00bd746
13086fd
7077928
08b9256
44b2c12
3ad8aff
bb7757a
e222298
e052728
2386132
919ee71
c691c6e
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -19,8 +19,7 @@ | |
) | ||
from graphql import DocumentNode | ||
from sentry_sdk import capture_exception | ||
from sentry_sdk import metrics as sentry_metrics | ||
from shared.metrics import Counter, Histogram | ||
from shared.metrics import Counter, Histogram, inc_counter | ||
|
||
from codecov.commands.exceptions import BaseException | ||
from codecov.commands.executor import get_executor_from_request | ||
|
@@ -51,6 +50,17 @@ | |
buckets=[0.05, 0.1, 0.25, 0.5, 0.75, 1, 2, 5, 10, 30], | ||
) | ||
|
||
GQL_REQUEST_MADE_COUNTER = Counter( | ||
"api_gql_requests_made", | ||
"Total API GQL requests made", | ||
["path"], | ||
) | ||
|
||
GQL_ERROR_TYPE_COUNTER = Counter( | ||
"api_gql_errors", | ||
"Number of times API GQL endpoint failed with an exception by type", | ||
["error_type", "path"], | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Could we include a comment on when to use existing |
||
) | ||
|
||
# covers named and 3 unnamed operations (see graphql_api/types/query/query.py) | ||
GQL_TYPE_AND_NAME_PATTERN = r"^(query|mutation|subscription)(?:\(\$input:|) (\w+)(?:\(| \(|{| {|!)|^(?:{) (me|owner|config)(?:\(| |{)" | ||
|
@@ -109,9 +119,13 @@ def request_started(self, context): | |
""" | ||
self.set_type_and_name(query=context["clean_query"]) | ||
self.start_timestamp = time.perf_counter() | ||
GQL_HIT_COUNTER.labels( | ||
operation_type=self.operation_type, operation_name=self.operation_name | ||
).inc() | ||
inc_counter( | ||
GQL_HIT_COUNTER, | ||
labels=dict( | ||
operation_type=self.operation_type, | ||
operation_name=self.operation_name, | ||
), | ||
) | ||
|
||
def request_finished(self, context): | ||
""" | ||
|
@@ -226,10 +240,12 @@ async def post(self, request, *args, **kwargs): | |
"user": request.user, | ||
} | ||
log.info("GraphQL Request", extra=log_data) | ||
sentry_metrics.incr("graphql.info.request_made", tags={"path": req_path}) | ||
|
||
inc_counter(GQL_REQUEST_MADE_COUNTER, labels=dict(path=req_path)) | ||
if self._check_ratelimit(request=request): | ||
sentry_metrics.incr("graphql.error.rate_limit", tags={"path": req_path}) | ||
inc_counter( | ||
GQL_ERROR_TYPE_COUNTER, | ||
labels=dict(error_type="rate_limit", path=req_path), | ||
) | ||
return JsonResponse( | ||
data={ | ||
"status": 429, | ||
|
@@ -245,7 +261,10 @@ async def post(self, request, *args, **kwargs): | |
data = json.loads(content) | ||
|
||
if "errors" in data: | ||
sentry_metrics.incr("graphql.error.all", tags={"path": req_path}) | ||
inc_counter( | ||
GQL_ERROR_TYPE_COUNTER, | ||
labels=dict(error_type="all", path=req_path), | ||
) | ||
try: | ||
if data["errors"][0]["extensions"]["cost"]: | ||
costs = data["errors"][0]["extensions"]["cost"] | ||
|
@@ -257,9 +276,12 @@ async def post(self, request, *args, **kwargs): | |
request_body=req_body, | ||
), | ||
) | ||
sentry_metrics.incr( | ||
"graphql.error.query_cost_exceeded", | ||
tags={"path": req_path}, | ||
inc_counter( | ||
GQL_ERROR_TYPE_COUNTER, | ||
labels=dict( | ||
error_type="query_cost_exceeded", | ||
path=req_path, | ||
), | ||
) | ||
return HttpResponseBadRequest( | ||
JsonResponse("Your query is too costly.") | ||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -790,14 +790,15 @@ def get_version_from_headers(headers): | |
return "unknown-user-agent" | ||
|
||
|
||
def generate_upload_sentry_metrics_tags( | ||
def generate_upload_prometheus_metrics_labels( | ||
action, | ||
request, | ||
is_shelter_request, | ||
endpoint: Optional[str] = None, | ||
repository: Optional[Repository] = None, | ||
position: Optional[str] = None, | ||
upload_version: Optional[str] = None, | ||
fill_labels: bool = True, | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. nit: A name like There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I agree, this would be much clearer. I've opted with using |
||
): | ||
metrics_tags = dict( | ||
agent=get_agent_from_headers(request.headers), | ||
|
@@ -806,13 +807,23 @@ def generate_upload_sentry_metrics_tags( | |
endpoint=endpoint, | ||
is_using_shelter="yes" if is_shelter_request else "no", | ||
) | ||
|
||
repo_visibility = None | ||
if repository: | ||
metrics_tags["repo_visibility"] = ( | ||
"private" if repository.private is True else "public" | ||
) | ||
if position: | ||
metrics_tags["position"] = position | ||
if upload_version: | ||
metrics_tags["upload_version"] = upload_version | ||
repo_visibility = "private" if repository.private else "public" | ||
|
||
optional_fields = { | ||
"repo_visibility": repo_visibility, | ||
"position": position, | ||
"upload_version": upload_version, | ||
} | ||
Comment on lines
+815
to
+819
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. This code doesn't look like it's doing anything different from the original code, and the original code is a bit simpler. What is the reason to create an There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. In the original code, the The original code does not create the fields if the corresponding arguments are not provided. However, the prometheus-client in Python raises an error if we initialize a metric with a certain number of labels but then call it with some empty labels. I introduced i.e. old code,
in the new code, it will return
If we still want the first dict to be returned, we can call generate tags with There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Ah, I see. I think that the new code will actually return
where the values for the three extra fields are of None type (not string). Is this still fine? There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I checked and that's fine, in prometheus it will interpret it as a string |
||
|
||
metrics_tags.update( | ||
{ | ||
field: value | ||
for field, value in optional_fields.items() | ||
if value or fill_labels | ||
} | ||
) | ||
|
||
return metrics_tags |
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,16 @@ | ||
from shared.metrics import Counter | ||
|
||
API_UPLOAD_COUNTER = Counter( | ||
"api_upload", | ||
"Total API upload endpoint requests", | ||
[ | ||
"agent", | ||
"version", | ||
"action", | ||
"endpoint", | ||
"is_using_shelter", | ||
"repo_visibility", | ||
"position", | ||
"upload_version", | ||
], | ||
) | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Nice, I like this pattern of pulling it out into a separate file! |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Couldn't quite tell - what's the intended difference between this one (
GQL_REQUEST_MADE_COUNTER
) andGQL_HIT_COUNTER
in above (line 34)? Any opportunities to combine?Also a nit but it seems like we use "Total" in the description when it's a
Histogram
notCounter
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I added
GQL_REQUEST_MADE_COUNTER
so that thepost
function inAsyncGraphqlView
class can increment this metric with the labelpath=req_path
, and the other metricGQL_HIT_COUNTER
is called when the actual gql API is called through therequest_started
extension: https://ariadnegraphql.org/docs/extensions. If we want to combine this, then we would have to givereq_path
to theQueryMetricsExtension
in some way, I was thinking of setting aself.path
attribute to theAsyncGraphqlView
class but since it's handling many async requests I don't think that would work. Do you know of a way to implement this?There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I believe path is available in the
context
argument likeinfo.context["request"].url.path
, if that's what you mean?If this will be a duplicate of the information in
GQL_HIT_COUNTER
then it seems like removing/combining makes sense. But since it sounds like it's meant to be another step in the lifecycle of a request, we can use it for debugging, for example comparing numbers against theGQL_REQUEST_MADE_COUNTER
. Actually I like the location of theGQL_REQUEST_MADE_COUNTER
better than the other one that already existed, so we can just leave it! I saw you were just converting the sentry one to prometheus anyway so we can just proceed.