Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

add sitemap index generator #1

Open
wants to merge 9 commits into
base: master
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from 4 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -3,3 +3,4 @@ manifestseption/manifests*
manifestseption/search-manifests*
manifestseption/sites-temp*
snooty/docs-master.zip
sitemap-index/sitemap-index.xml
1 change: 1 addition & 0 deletions sitemap-index/.python-version
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
3.11.2
162 changes: 162 additions & 0 deletions sitemap-index/sitemap-index-generator.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,162 @@
import pymongo
import pandas as pd
import os
from flutter import check_type, checked
from dataclasses import dataclass
from typing import Optional

# TODO: replace this with a flag in the DB like excludeFromSitemapIndex or the like
# Repo names that are left out of the sitemap index; the main loop tests each
# document's repoName for membership in this list.
excluded_repos = [
    "docs-404",
    "docs-meta",
    "devhub-content",
    "docs-mongodb-internal",
    "docs-mongodb-internal-base",
    "docs-csfle-merge",
    "docs-k8s-operator",
    "docs-php-library",
    "docs-ruby",
    "docs-mongoid",
    "mms-docs",
]

@checked
@dataclass
class SitemapUrlSuffix:
    """URL pieces for one branch's sitemap, as built by ConstructSitemapEntry."""

    # Name of the git branch the pieces were derived from.
    gitBranchName: str
    # Path segment placed after the repo base URL: the branch's urlSlug if set,
    # else the git branch name when publishOriginalBranchName is true, else "".
    urlSuffix: str
    # Sitemap file path, either "/sitemap-0.xml" or "/sitemap.xml.gz".
    extension: str

@checked
@dataclass
class Branch:
    """One entry of a repo document's "branches" list.

    Field order matters: ConstructRepo.get_branches() builds instances
    positionally.
    """

    # Required on the source entry (read with a plain key lookup).
    gitBranchName: str
    # Only active branches contribute a sitemap URL; False when absent.
    active: bool
    # When true (and urlSlug is empty) the git branch name becomes the URL
    # suffix; False when absent.
    publishOriginalBranchName: bool
    # Preferred URL suffix when set; None when absent.
    urlSlug: Optional[str]
    # Selects "/sitemap-0.xml" (true) vs "/sitemap.xml.gz" (false); True when
    # absent.
    buildsWithSnooty: bool

@checked
@dataclass
class Repo:
    """Cleaned-up view of one repos_branches document, built by ConstructRepo."""

    # Copied from the document's "repoName" key.
    repoName: str
    # Parsed branch entries, or None when the document's "branches" is empty.
    branches: list[Branch] | None
    # Value of the document's prefix["dotcomprd"] entry.
    prefix: str
    # "https://www.mongodb.com/" + prefix + "/".
    baseUrl: str
class ConstructRepo:
# Build the derived fields (repoName, branches, prefix, baseUrl) from one raw
# repos_branches document; export() later packages them into a Repo.
def __init__(self, data) -> None:
# Keep the raw document: get_branches() and get_prefix() read from self.data.
self.data = data
Copy link

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

so the way you actually want to use check_type() is like this:

self.data = check_type(Repo, data)

Copy link
Owner Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Isn't data here not a Repo though? Like the data data has a bunch of different fields from my defined Repo class and the final export from this is the Repo object?


self.repoName: str = data["repoName"]
self.branches = self.get_branches()
# Order matters: derive_url() reads self.prefix, so the prefix is set first.
self.prefix = self.get_prefix()
self.baseUrl = self.derive_url()

def get_prefix(self) -> str:
    """Return the "dotcomprd" prefix from the raw repo document.

    Returns:
        The value stored at ``self.data["prefix"]["dotcomprd"]``.

    Raises:
        KeyError: if the document has no ``prefix`` / ``dotcomprd`` entry.
        TypeError: if the entry is not a non-empty string.
    """
    prefix = self.data["prefix"]["dotcomprd"]
    # Validate with isinstance rather than the truthiness of flutter's
    # check_type() result, so the TypeError below does not depend on what
    # check_type returns or raises on a mismatch.
    if not isinstance(prefix, str) or not prefix:
        raise TypeError(
            f"prefix.dotcomprd must be a non-empty string, got {prefix!r}"
        )
    return prefix

def derive_url(self) -> str:
    """Return the repo's base URL: the mongodb.com host, the prefix, and a trailing slash."""
    return f"https://www.mongodb.com/{self.prefix}/"

def get_branches(self) -> list[Branch] | None:
    """Convert the document's "branches" entries into Branch objects.

    Returns:
        A list with one Branch per entry, or None when "branches" is empty
        or otherwise falsy (in which case ``self.wonky`` is set to True).

    Raises:
        KeyError: if "branches" is missing or an entry lacks "gitBranchName".
    """
    raw_branches = self.data["branches"]
    if not raw_branches:
        # Mark the repo as odd; nothing else in this file reads the flag.
        self.wonky = True
        return None
    # Optional keys fall back to the same defaults for every entry.
    return [
        Branch(
            gitBranchName=entry["gitBranchName"],
            active=entry.get("active", False),
            publishOriginalBranchName=entry.get("publishOriginalBranchName", False),
            urlSlug=entry.get("urlSlug", None),
            buildsWithSnooty=entry.get("buildsWithSnooty", True),
        )
        for entry in raw_branches
    ]

def export(self) -> Repo:
    """Package the fields derived in __init__ into a Repo."""
    return Repo(
        repoName=self.repoName,
        branches=self.branches,
        prefix=self.prefix,
        baseUrl=self.baseUrl,
    )


class ConstructSitemapEntry:
    """Derive the sitemap URL suffix and sitemap file path for one Branch."""

    def __init__(self, data: Branch) -> None:
        self.data = data

        # gitBranchName must be set before derive_url_suffix(), which may
        # return it.
        self.gitBranchName: str = data.gitBranchName
        self.urlSuffix = self.derive_url_suffix()
        self.extension = self.derive_extension()

    def derive_extension(self) -> str:
        """Return the sitemap file path, chosen by the branch's buildsWithSnooty flag."""
        return "/sitemap-0.xml" if self.data.buildsWithSnooty else "/sitemap.xml.gz"

    def derive_url_suffix(self) -> str:
        """Return the URL segment for this branch.

        Precedence: a truthy urlSlug, then the git branch name when
        publishOriginalBranchName is set, otherwise the empty string.
        """
        branch = self.data
        if branch.urlSlug:
            return branch.urlSlug
        if branch.publishOriginalBranchName:
            return self.gitBranchName
        return ""

    def export(self) -> SitemapUrlSuffix:
        """Package the derived pieces into a SitemapUrlSuffix."""
        return SitemapUrlSuffix(
            gitBranchName=self.gitBranchName,
            urlSuffix=self.urlSuffix,
            extension=self.extension,
        )

def run_validation(data) -> tuple[bool, str]:
    """Check that a raw repo document has the fields the generator needs.

    Args:
        data: one document from the repos_branches collection (dict-like).

    Returns:
        ``(True, "")`` when the document is usable, otherwise
        ``(False, reason)`` with a short human-readable reason.
    """
    # .get() plus isinstance so a missing or non-string repoName is reported
    # as invalid instead of raising from the lookup or the type check.
    repo_name = data.get("repoName")
    if not isinstance(repo_name, str) or not repo_name:
        return False, "No repo name?!"
    if not data.get("branches"):
        return False, "No branch entry"
    prefix = data.get("prefix")
    if not (prefix and prefix.get("dotcomprd")):
        return False, "No dotcomprd prefix entry"
    return True, ""

# Collection holding one document per docs repo.
# NOTE(review): if SNOOTY_CONN_STRING is unset, None is passed to MongoClient —
# confirm that fallback is acceptable.
repos_branches = pymongo.MongoClient(os.environ.get('SNOOTY_CONN_STRING'))["pool"].repos_branches

repos_branches_data = repos_branches.find()
sitemap_urls: list[str] = []

for r in repos_branches_data:
    # .get() so a document without repoName reaches run_validation instead of
    # raising on this progress print.
    print(r.get("repoName"))
    validity, message = run_validation(r)
    if not validity:
        print(message)
        continue
    # Skip repos that do not need sitemaps or whose sitemaps are horribly broken because built by legacy tooling
    if r["repoName"] in excluded_repos:
        print("Skipping")
        continue
    repo = ConstructRepo(r).export()

    if not repo.branches:
        print("Repo has no branches.")
        continue

    for b in repo.branches:
        if not b.active:
            continue
        print(b.gitBranchName)
        sitemap_suffix: SitemapUrlSuffix = ConstructSitemapEntry(b).export()
        # baseUrl ends with "/" and extension starts with "/", so plain
        # concatenation yields "//" whenever urlSuffix is empty. Join only the
        # non-empty segments, with exactly one slash between them.
        segments = [
            part.strip("/")
            for part in (sitemap_suffix.urlSuffix, sitemap_suffix.extension)
        ]
        sitemap_url: str = (
            repo.baseUrl.rstrip("/") + "/" + "/".join(s for s in segments if s)
        )
        print(sitemap_url)
        sitemap_urls.append(sitemap_url)

print(sitemap_urls)

# Set up DataFrame from the list of URLs

df = pd.DataFrame(sitemap_urls, columns=["loc"])

# index=False keeps the DataFrame row index out of each <sitemap> element, and
# the default namespace is the one the sitemap protocol requires on the root.
xml_data = df.to_xml(
    index=False,
    root_name="sitemapindex",
    row_name="sitemap",
    namespaces={"": "http://www.sitemaps.org/schemas/sitemap/0.9"},
    xml_declaration=True,
)
print(xml_data)

# Save the XML data to a file
# Explicit encoding so the bytes on disk match the XML declaration regardless
# of the platform default.
with open("sitemap-index.xml", "w", encoding="utf-8") as file:
    file.write(xml_data)