-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathreplace_urls_absolute.py
133 lines (109 loc) · 4.99 KB
/
replace_urls_absolute.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
import subprocess
from bs4 import BeautifulSoup
import re
import os;
from urllib.parse import urlparse
ELIPSIS_REGEX = r"(?:\.\./)+"
def main():
for root, dirs, files in os.walk("./SSW.Website.WebUI"):
for file in files:
if not (file.endswith(".aspx") or file.endswith(".htm") or file.endswith(".html")):
# print(f"file is invalid type {file}")
continue
if file.startswith("zz") or file.startswith("zr"):
# print("file is removed")
continue
file = file.replace("za", "",1 )
# print(f"{root}\\{file}")
path = path_to_html_equivalent(f"{root}\\{file}")
path_exists = os.path.exists(path)
if not os.path.exists(path):
# print(f"WARNING: no archived file found at {path}")
continue
with open(path, "r+", encoding="utf-8") as f:
soup = BeautifulSoup(f, "html5lib")
hrefs = soup.findAll("a", href=True)
if hrefs is None:
# print("no hrefs found")
continue
for href in hrefs:
link = href['href']
if link == "":
continue
elipses = re.findall(r"(?:\.\./)+",link)
url_points_upwards = len(elipses) > 0
if url_points_upwards:
print(f"url points upward {file}")
link = f"/ssw/{re.sub(ELIPSIS_REGEX,folder_of_upward_traversal(elipses[0], root,),link)}"
href["href"] = link
continue
domain = urlparse(link).netloc
link_hosted_ssw = "ssw.com.au" in domain or link.startswith("/ssw")
if not link_hosted_ssw and not link.startswith("http") and not link.startswith(" http") and not link.startswith("#") and not link.startswith("mailto") and not link.startswith("javascript") and not link.startswith("tel:") and not link.endswith(".pdf") and not link.endswith(".txt") and not link.endswith(".pdf") and not link.endswith(".png") and not link.startswith("mailto") and not link.startswith("javascript") and not link.startswith("tel:") and not link.endswith(".pdf") and not link.endswith(".txt") and not link.endswith(".pdf") and not link.endswith(".jpg") and not link.startswith("\\ssw"):
if not link.startswith("/") and not link.startswith("./"):
new_dir = root.replace("\\", "/")
new_dir = new_dir.replace("./SSW.Website.WebUI", "/ssw")
href["href"] = f'{new_dir}/{link}'
f.seek(0)
f.truncate()
# print("writing changes")
f.write(str(soup.prettify(formatter="html5")))
# if path_exists:
# subprocess.run(["npx", "prettier", "--write", path], check=True)
def folder_of_upward_traversal(path: str, current_directory: str):
elipses = re.findall(r"(?:\.\./)+",path)
path_to_physical_folder = os.path.join(current_directory, elipses[0])
print("path to physical folder")
print(path_to_physical_folder)
path_to_file = os.path.relpath(path_to_physical_folder)
print("path_to_file")
print(path_to_file)
path_to_file = path_to_file.replace("SSW.Website.WebUI", "")
if(not path_to_file.endswith("\\")):
path_to_file += "\\"
if(path_to_file.startswith("\\")):
path_to_file = path_to_file[1:]
return path_to_file.replace("\\", "/")
def path_to_html_equivalent(path : str) -> str:
path = path.replace("SSW.Website.WebUI", "archive")
path = path.replace("aspx", "html")
path = pascal_to_kebab(path)
return path
def pascal_to_kebab(s: str) -> str:
if s.endswith("demo-fms200203"):
return s
# Convert PascalCase to kebab-case
regex = r"([a-z])([A-Z0-9])"
replacements = {
"About Me" : "about-me",
"Chirag Modi": "chirag-modi",
"Frand Jajo": "frand-jajo",
"Jack Lin": "jack-lin",
"PCAnywhere" : "pc-anywhere",
"ThankyouShare": "thank-you-share",
"demo-fms200203" : "demo-fms200203",
"RulestoBetterCommandLines": "rules-to-better-command-lines",
"eXtremeEmails": "extreme-emails",
"NET": "Net",
"NetUG": "Netug",
"SQL": "Sql",
"BI": "Bi",
"ALM": "Alm",
"SSW": "Ssw",
"iOS": "Ios",
"SignalR": "Signalr",
"AngularJS": "Angularjs",
"HoloLens": "Hololens",
"UTS": "Uts",
"DevOps": "Devops",
"SharePoint": "Sharepoint",
"_": "-",
}
for old, new in replacements.items():
s = s.replace(old, new)
s = re.sub(r"and([A-Z0-9])", r"And\1", s)
s = re.sub(r"to([A-Z0-9])", r"To\1", s)
# add dashes between words and numbers
s = re.sub(r"(\d+)([A-Z])", r"\1-\2", s)
return re.sub(regex, r"\1-\2", s).lower()
main()