Serve repodata.json with redirect #506

Open: wants to merge 3 commits into base: main

Changes from 1 commit
2 changes: 1 addition & 1 deletion environment.yml
@@ -8,7 +8,7 @@ dependencies:
 - typer
 - authlib
 - psycopg2
-- httpx=0.20.0
+- httpx=0.22.0
 - sqlalchemy
 - sqlalchemy-utils
 - sqlite
101 changes: 55 additions & 46 deletions quetz/main.py
@@ -1608,10 +1608,8 @@ def serve_path(
     session=Depends(get_remote_session),
     dao: Dao = Depends(get_dao),
 ):
-
-    chunk_size = 10_000
-
     is_package_request = path.endswith((".tar.bz2", ".conda"))
+    is_repodata_request = path.endswith(".json")
 
     package_name = None
     if is_package_request:
@@ -1630,65 +1628,74 @@ def serve_path(
         except ValueError:
             pass
 
+    # if we exclude the package from syncing, redirect to original URL
     if is_package_request and channel.mirror_channel_url:
-        # if we exclude the package from syncing, redirect to original URL
         channel_proxylist = json.loads(channel.channel_metadata).get('proxylist', [])
         if channel_proxylist and package_name and package_name in channel_proxylist:
             return RedirectResponse(f"{channel.mirror_channel_url}/{path}")
 
+    fsize = fmtime = fetag = None
+
     if channel.mirror_channel_url and channel.mirror_mode == "proxy":
         repository = RemoteRepository(channel.mirror_channel_url, session)
-        if not pkgstore.file_exists(channel.name, path):
+        if is_repodata_request:
+            # Invalidate repodata.json and current_repodata.json after channel.ttl seconds
+            try:
+                fsize, fmtime, fetag = pkgstore.get_filemetadata(channel.name, path)
+                cache_miss = time.time() - fmtime >= channel.ttl
+            except FileNotFoundError:
+                cache_miss = True
+        else:
+            cache_miss = not pkgstore.file_exists(channel.name, path)
+        if cache_miss:
             download_remote_file(repository, pkgstore, channel.name, path)
-        elif path.endswith(".json"):
-            # repodata.json and current_repodata.json are cached locally
-            # for channel.ttl seconds
-            _, fmtime, _ = pkgstore.get_filemetadata(channel.name, path)
-            if time.time() - fmtime >= channel.ttl:
-                download_remote_file(repository, pkgstore, channel.name, path)
-
-    if (
-        is_package_request or pkgstore.kind == "LocalStore"
-    ) and pkgstore.support_redirect:
-        return RedirectResponse(pkgstore.url(channel.name, path))
-
-    def iter_chunks(fid):
-        while True:
-            data = fid.read(chunk_size)
-            if not data:
-                break
-            yield data
-
-    if path == "" or path.endswith("/"):
-        path += "index.html"
-    package_content_iter = None
-
-    headers = {}
-    if accept_encoding and 'gzip' in accept_encoding and path.endswith('.json'):
-        # return gzipped response
-        try:
-            package_content_iter = iter_chunks(
-                pkgstore.serve_path(channel.name, path + '.gz')
-            )
-            path += '.gz'
-            headers['Content-Encoding'] = 'gzip'
-            headers['Content-Type'] = 'application/json'
-        except FileNotFoundError:
-            pass
+            fsize = fmtime = fetag = None
 
-    while not package_content_iter:
-        try:
-            package_content_iter = iter_chunks(pkgstore.serve_path(channel.name, path))
+    gzip_exists = (
+        is_repodata_request
+        and accept_encoding
+        and 'gzip' in accept_encoding
+        and pkgstore.file_exists(channel.name, path + ".gz")
+    )
+
+    # Redirect response
+    if (is_package_request or is_repodata_request) and pkgstore.support_redirect:
+        return RedirectResponse(
+            pkgstore.url(channel.name, path + ".gz" if gzip_exists else path)
+        )

Review thread on the RedirectResponse above:

Member: did you confirm that this works? Since it redirects to a path with .gz attached, I am not sure mamba / conda will handle that correctly.

Author: You are probably right; I will have to do more testing. I should have done more testing already, because I've found two other issues as well. One is that conda sets Content-Type: json, which doesn't work with the GCS signature (I assume you need to sign all the request headers as well, not just the URL?). Also, if a file is not found, the client will get a 403 from GCS, although it would be more helpful to receive a 404.
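For reference, the header-signing question can be illustrated with the google-cloud-storage client; this is only a sketch of the underlying mechanism (quetz itself goes through its pkgstore/gcsfs layer), and the bucket and object names are hypothetical:

# Sketch: GCS V4 signing can cover request headers, not just the URL. Any
# header listed here is baked into the signature, so a client such as conda
# that sends its own Content-Type must send exactly this value, or GCS
# rejects the request with a 403.
import datetime

from google.cloud import storage


def signed_repodata_url(bucket_name: str, object_path: str) -> str:
    client = storage.Client()
    blob = client.bucket(bucket_name).blob(object_path)
    return blob.generate_signed_url(
        version="v4",
        expiration=datetime.timedelta(minutes=15),
        method="GET",
        headers={"Content-Type": "application/json"},
    )


# Hypothetical usage:
# url = signed_repodata_url("quetz-bucket", "mychannel/linux-64/repodata.json.gz")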

+    # Streaming response
+    def serve_file(path):
+        try:
+            return pkgstore.serve_path(channel.name, path)
         except FileNotFoundError:
             raise HTTPException(
                 status_code=status.HTTP_404_NOT_FOUND,
                 detail=f"{channel.name}/{path} not found",
             )
-        except IsADirectoryError:
-            path += "/index.html"
 
-    fsize, fmtime, fetag = pkgstore.get_filemetadata(channel.name, path)
+    if gzip_exists:
+        path += '.gz'
+        package_content = serve_file(path)
+        headers = {
+            'Content-Encoding': 'gzip',
+            'Content-Type': 'application/json',
+        }
+    else:
+        if path == "" or path.endswith("/"):
+            path += "index.html"
+            package_content = serve_file(path)
+        else:
+            try:
+                package_content = serve_file(path)
+            except IsADirectoryError:
+                path += "/index.html"
+                package_content = serve_file(path)
+        headers = {}
 
+    if fsize is None:
+        # Maybe we already got (fsize, fmtime, fetag) above
+        fsize, fmtime, fetag = pkgstore.get_filemetadata(channel.name, path)
     headers.update(
         {
             'Cache-Control': f'max-age={channel.ttl}',
@@ -1697,6 +1704,8 @@ def iter_chunks(fid):
             'ETag': fetag,
         }
     )
+    chunk_size = 10_000
+    package_content_iter = iter(lambda: package_content.read(chunk_size), b"")
     return StreamingResponse(package_content_iter, headers=headers)
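The two-argument form of iter() used above keeps calling the lambda and stops when it returns the sentinel b"", which is what read() returns at end of file. A standalone illustration (the file name is made up):

# Illustration of iter(callable, sentinel): keep calling the callable until
# it returns the sentinel value (b"" at end of file here).
chunk_size = 10_000
total = 0

with open("repodata.json", "rb") as f:  # hypothetical local file
    for chunk in iter(lambda: f.read(chunk_size), b""):
        total += len(chunk)

print(f"streamed {total} bytes in chunks of up to {chunk_size} bytes")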


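On the 403-versus-404 point from the review thread: since GCS answers 403 for a missing object behind a signed URL, one option (not part of this PR) would be to check existence before redirecting. A hedged sketch, reusing the pkgstore methods that appear in the diff; the helper itself is hypothetical:

# Sketch only: return quetz's own 404 for missing files instead of letting
# clients hit GCS's 403. `pkgstore.file_exists` and `pkgstore.url` are the
# store methods used in the diff above.
from fastapi import HTTPException, status
from fastapi.responses import RedirectResponse


def redirect_or_404(pkgstore, channel_name: str, path: str) -> RedirectResponse:
    if not pkgstore.file_exists(channel_name, path):
        raise HTTPException(
            status_code=status.HTTP_404_NOT_FOUND,
            detail=f"{channel_name}/{path} not found",
        )
    return RedirectResponse(pkgstore.url(channel_name, path))

The trade-off is an extra existence check (one more round-trip to the store) on every redirected request.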
2 changes: 0 additions & 2 deletions quetz/pkgstores.py
@@ -572,8 +572,6 @@ def __init__(self, config):

     @property
     def support_redirect(self):
-        # `gcsfs` currently doesnt support signing yet. Once this is implemented we
-        # can enable this again.
         return True
 
     @contextlib.contextmanager
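Re-enabling support_redirect here presumably relies on gcsfs having gained URL signing in the meantime. A minimal sketch of the fsspec-style call, independent of quetz's actual GCSStore wiring; the bucket and object names are hypothetical:

# Minimal sketch of fsspec-style signing with gcsfs; this raises
# NotImplementedError on gcsfs versions that predate signing support.
import gcsfs

fs = gcsfs.GCSFileSystem(token="google_default")

# `expiration` is in seconds in the fsspec sign() API.
url = fs.sign("quetz-bucket/mychannel/noarch/repodata.json", expiration=3600)
print(url)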
28 changes: 28 additions & 0 deletions quetz/tests/api/test_main.py
@@ -1,9 +1,37 @@
 import datetime
+import io
 from unittest.mock import ANY
 
 import pytest
 
 from quetz.metrics.db_models import PackageVersionMetric
 from quetz.tasks.indexing import update_indexes
 
 
+def test_index_html(package_version, channel_name, client, mocker):
+    def serve_path(channel_name, path):
+        if path.endswith("index.html"):
+            return io.BytesIO(b"index.html content")
+        else:
+            raise IsADirectoryError
+
+    pkgstore = mocker.Mock()
+    pkgstore.get_filemetadata.return_value = (0, 0, "")
+    pkgstore.url.side_effect = lambda chan, p: f"{chan}/{p}"
+    pkgstore.serve_path.side_effect = serve_path
+    mocker.patch("quetz.main.pkgstore", pkgstore)
+
+    for url in [
+        f"/get/{channel_name}",
+        f"/get/{channel_name}/",
+        f"/get/{channel_name}/index.html",
+        f"/get/{channel_name}/linux-64",
+        f"/get/{channel_name}/linux-64/",
+        f"/get/{channel_name}/linux-64/index.html",
+    ]:
+        response = client.get(url, allow_redirects=False)
+        assert response.status_code == 200
+        assert response.text == "index.html content"
+
+
 def test_get_package_list(package_version, package_name, channel_name, client):
52 changes: 35 additions & 17 deletions quetz/tests/test_mirror.py
@@ -459,7 +459,6 @@ def test_synchronisation_sha(
     n_new_packages,
     arch,
     package_version,
-    mocker,
 ):
     pkgstore = config.get_package_store()
     rules = Rules("", {"user_id": str(uuid.UUID(bytes=user.id))}, db)
@@ -521,7 +520,6 @@ def test_synchronisation_no_checksums_in_db(
     n_new_packages,
     arch,
     package_version,
-    mocker,
 ):
 
     package_info = '{"size": 5000, "subdirs":["noarch"]}'
@@ -562,7 +560,15 @@ def close(self):
     assert len(versions) == n_new_packages + 1
 
 
-def test_download_remote_file(client, owner, dummy_repo):
+@pytest.fixture(params=[True, False])
+def main_pkgstore_redirect(request, config, monkeypatch):
+    from quetz import main
+
+    monkeypatch.setattr(main.pkgstore, "redirect_enabled", request.param)
+    return request.param
+
+
+def test_download_remote_file(client, owner, dummy_repo, main_pkgstore_redirect):
     """Test downloading from cache."""
     response = client.get("/api/dummylogin/bartosz")
     assert response.status_code == 200
@@ -605,7 +611,9 @@ def test_download_remote_file(client, owner, dummy_repo):
     assert dummy_repo == [("http://host/test_file_2.txt")]
 
 
-def test_download_remote_file_in_parallel(client, owner, dummy_repo):
+def test_download_remote_file_in_parallel(
+    client, owner, dummy_repo, main_pkgstore_redirect
+):
     """Test downloading in parallel."""
     response = client.get("/api/dummylogin/bartosz")
     assert response.status_code == 200
@@ -636,7 +644,7 @@ def get_remote_file(filename):
     assert dummy_repo == [(f"http://host/{test_file}")]
 
 
-def test_proxy_repodata_cached(client, owner, dummy_repo):
+def test_proxy_repodata_cached(client, owner, dummy_repo, main_pkgstore_redirect):
     """Test downloading from cache."""
     response = client.get("/api/dummylogin/bartosz")
     assert response.status_code == 200
@@ -652,13 +660,15 @@ def test_proxy_repodata_cached(client, owner, dummy_repo):
     )
     assert response.status_code == 201
 
-    response = client.get("/get/proxy-channel-2/repodata.json")
-    assert response.status_code == 200
-    assert response.content == b"Hello world!"
-
-    response = client.get("/get/proxy-channel-2/repodata.json")
-    assert response.status_code == 200
-    assert response.content == b"Hello world!"
+    for i in range(2):
+        response = client.get(
+            "/get/proxy-channel-2/repodata.json", allow_redirects=False
+        )
+        if main_pkgstore_redirect:
+            assert 300 <= response.status_code < 400
+        else:
+            assert response.status_code == 200
+            assert response.content == b"Hello world!"
 
     # repodata.json was cached locally and downloaded from the
     # the remote only once
@@ -674,7 +684,9 @@ def test_method_not_implemented_for_proxies(client, proxy_channel):
     assert "not implemented" in response.json()["detail"]
 
 
-def test_api_methods_for_mirror_channels(client, mirror_channel):
+def test_api_methods_for_mirror_channels(
+    client, mirror_channel, main_pkgstore_redirect
+):
     """mirror-mode channels should have all standard API calls"""
 
     response = client.get("/api/channels/{}/packages".format(mirror_channel.name))
@@ -683,9 +695,13 @@ def test_api_methods_for_mirror_channels(client, mirror_channel):
 
     response = client.get(
         "/get/{}/missing/path/file.json".format(mirror_channel.name),
+        allow_redirects=False,
     )
-    assert response.status_code == 404
-    assert "file.json not found" in response.json()["detail"]
+    if main_pkgstore_redirect:
+        assert 300 <= response.status_code < 400
+    else:
+        assert response.status_code == 404
+        assert "file.json not found" in response.json()["detail"]
 
 
 @pytest.mark.parametrize(
@@ -820,7 +836,9 @@ def test_add_and_register_mirror(auth_client, dummy_session_mock):
         ]
     ],
 )
-def test_wrong_package_format(client, dummy_repo, owner, job_supervisor):
+def test_wrong_package_format(
+    client, dummy_repo, owner, job_supervisor, main_pkgstore_redirect
+):
 
     response = client.get("/api/dummylogin/bartosz")
     assert response.status_code == 200
@@ -1062,7 +1080,7 @@ def test_includelist_and_excludelist_mirror_channel(owner, client):
 
 
 @pytest.mark.parametrize("mirror_mode", ["proxy", "mirror"])
-def test_proxylist_mirror_channel(owner, client, mirror_mode):
+def test_proxylist_mirror_channel(owner, client, mirror_mode, main_pkgstore_redirect):
     response = client.get("/api/dummylogin/bartosz")
     assert response.status_code == 200
 
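The main_pkgstore_redirect fixture introduced above uses pytest's fixture parametrization: every test that requests the fixture is collected twice, once with redirects enabled and once without. A self-contained illustration of the mechanism, with made-up names:

import pytest


@pytest.fixture(params=[True, False])
def redirect_enabled(request):
    # request.param takes each value in `params`; the test below is collected
    # as test_redirect_flag[True] and test_redirect_flag[False].
    return request.param


def test_redirect_flag(redirect_enabled):
    assert redirect_enabled in (True, False)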