Commit 342ec08

Merge pull request #280 from serlo/EBZ4j-kulla-2025-01-28-21-45
feat: Preparse content + add info prints
kulla authored Jan 28, 2025
2 parents 83aa9c0 + 7499b93 commit 342ec08
Showing 3 changed files with 27 additions and 14 deletions.
cache/current-content.json.gz (binary file modified; contents not shown)
serlo_api_client.py (8 changes: 6 additions & 2 deletions)
@@ -1,5 +1,6 @@
 """A client to send requests to the Serlo GraphQL API"""
 
+import json
 import os
 
 from typing import Dict, Any, Optional
@@ -56,8 +57,11 @@ def fetch_current_content(revision_id):
     )
     result = execute(query, {"id": revision_id})
 
-    if content := result.get("uuid", {}).get("content", None):
-        return content
+    if content_text := result.get("uuid", {}).get("content", None):
+        try:
+            return json.loads(content_text)
+        except json.JSONDecodeError:
+            pass
     return None

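With this change, fetch_current_content returns the revision content already parsed into a Python object instead of the raw JSON string, and falls back to None when the content is missing or not valid JSON. A minimal usage sketch (the revision id below is made up purely for illustration):

from serlo_api_client import fetch_current_content

# 12345 is a hypothetical revision id, used only for this example.
content = fetch_current_content(12345)

if content is None:
    print("No parseable content for this revision")
else:
    # The caller now receives the already-parsed document (e.g. a dict)
    # and no longer needs to call json.loads itself.
    print(type(content))
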
update_datenraum_nodes.py (33 changes: 21 additions & 12 deletions)
@@ -13,8 +13,10 @@
 from serlo_api_client import fetch_current_content
 from utils import has_description, pick
 
+# See https://github.com/serlo/evaluations/blob/main/src/2025/2025-01-28-cache-current-revisions.ipynb
+# for the generation of this file
 CACHED_CONTENT_FILE = "cache/current-content.json.gz"
-MAX_CONTENT_DOWNLOAD_TIME = 30 * 60
+MAX_CONTENT_DOWNLOAD_TIME = 20 * 60
 
 
 def main(metadata_file, nodes_file):
@@ -38,7 +40,11 @@ def main(metadata_file, nodes_file):
     filtered_records = [record for record in records if has_description(record)]
 
     if isinstance(env, PotsdamEnvironment):
-        records = [record for record in records if record["content"] is not None]
+        records = [
+            record
+            for record in records
+            if "content" in record and record["content"] is not None
+        ]
     else:
         records = filtered_records + taxonomies

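The reworked filter also guards against records that never received a "content" key at all (for example when the download loop stopped early), which the old check would have turned into a KeyError. A small sketch with invented records:

# Sample records are made up for illustration only.
records = [
    {"id": 1, "content": {"plugin": "rows"}},
    {"id": 2, "content": None},
    {"id": 3},  # no "content" key, e.g. skipped due to the download time limit
]

records_with_content = [
    record
    for record in records
    if "content" in record and record["content"] is not None
]

# Only the first record survives; the old check `record["content"] is not None`
# would have raised KeyError on the third one.
assert [record["id"] for record in records_with_content] == [1]
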
@@ -47,11 +53,14 @@
 
 
 def add_content_to_records(records):
+    print("INFO: Load cached content")
+
     with gzip.open(CACHED_CONTENT_FILE, "rt", encoding="utf-8") as gzip_file:
         cached_content = json.load(gzip_file)
 
     start_time = current_time()
 
+    print("INFO: Start content download")
     for record in records:
         if (current_time() - start_time) > MAX_CONTENT_DOWNLOAD_TIME:
             print("INFO: Stop content download due to time limit")

@@ -64,26 +73,26 @@ def add_content_to_records(records):
             continue
 
         if current_revision_id in cached_content:
-            content_text = cached_content[current_revision_id]
+            content = cached_content[current_revision_id]
         else:
             print(f"INFO: Download content for {current_revision_id}")
 
-            content_text = fetch_current_content(current_revision_id)
-            cached_content[current_revision_id] = content_text
+            content = fetch_current_content(current_revision_id)
+            cached_content[current_revision_id] = content
 
             # Do not hammer the API
-            time.sleep(0.5)
+            time.sleep(0.1)
+
+        if content is not None:
+            record["content"] = content
 
-        if isinstance(content_text, str):
-            try:
-                content = json.loads(content_text)
-                record["content"] = content
-            except json.JSONDecodeError:
-                continue
+    print("INFO: Save cached content")
 
     with gzip.open(CACHED_CONTENT_FILE, "wt", encoding="utf-8") as gzip_file:
         json.dump(cached_content, gzip_file)
 
+    print("INFO: Finish content download")
+
     return records

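The loop above combines a wall-clock budget (MAX_CONTENT_DOWNLOAD_TIME, now 20 minutes) with a short pause between API calls. Stripped of the record handling, the pattern looks roughly like the sketch below; fetch_one and the items list are placeholders, not part of the repository:

import time

MAX_CONTENT_DOWNLOAD_TIME = 20 * 60  # seconds


def fetch_one(item):
    # Placeholder for a single API call such as fetch_current_content().
    return None


def download_with_budget(items):
    start_time = time.time()
    results = {}

    for item in items:
        # Stop once the overall time budget is used up.
        if (time.time() - start_time) > MAX_CONTENT_DOWNLOAD_TIME:
            print("INFO: Stop content download due to time limit")
            break

        results[item] = fetch_one(item)

        # Do not hammer the API
        time.sleep(0.1)

    return results
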
