Skip to content

Commit

Permalink
Fix: number of redirects
Browse files Browse the repository at this point in the history
  • Loading branch information
AntoniaBK committed Sep 3, 2024
1 parent 063e323 commit 09ec3b7
Show file tree
Hide file tree
Showing 3 changed files with 19 additions and 9 deletions.
4 changes: 3 additions & 1 deletion feature_extractor/feature_extractor.py
Original file line number Diff line number Diff line change
Expand Up @@ -191,14 +191,15 @@ def redirect_features(self):
"""Gets information about redirects to other pages
Adds the following features to observation:
- 'ip'
- 'number_redirects'
- 'different_final_domain'
"""
first_hostname = get_hostname(self.url)
last_hostname = get_hostname(self.capture.get_last_redirect())

self.observation['ip'] = if_exists(self.capture.get_ips(), last_hostname, None) # CDN weighting warninglist
self.observation['number_redirects'] = len(self.capture.get_redirects())
#self.observation['number_redirects'] = len(self.capture.get_redirects())
self.observation['different_final_domain'] = False if first_hostname == last_hostname else True

def link_features(self):
Expand Down Expand Up @@ -251,6 +252,7 @@ def har_features(self):
- "third_party_html_content_ratio"
- "initial_response_size"
- "initial_response_ratio"
- 'number_redirects'
"""
har = self.capture.get_har()
har_extractor = HARFeaturesExtractor(har)
Expand Down
7 changes: 6 additions & 1 deletion feature_extractor/har_features.py
Original file line number Diff line number Diff line change
Expand Up @@ -21,8 +21,12 @@ def extract_features(self):
total_html_content = 0
third_party_html_content = 0
initial_response_size = 0
number_redirects = 0

for entry in self.har_parser.pages[0].entries: # type: ignore
status = entry['response']['status']
if status in [301, 302, 303, 307, 308]:
number_redirects += 1
url = entry.request.url
response_size = entry.response.bodySize
content_type = entry.response.mimeType
Expand Down Expand Up @@ -52,7 +56,8 @@ def extract_features(self):
"third_party_data_ratio": third_party_data_ratio,
"third_party_html_content_ratio": third_party_html_content_ratio,
"initial_response_size": initial_response_size,
"initial_response_ratio": initial_response_ratio
"initial_response_ratio": initial_response_ratio,
"number_redirects": number_redirects
}

return features
17 changes: 10 additions & 7 deletions main.py
Original file line number Diff line number Diff line change
Expand Up @@ -22,19 +22,22 @@ def make_observation(path:str, tags:list[str] = []) -> dict[str, Any]|None:
return features

def remove_useless_tags(l:list[str])-> list[str]:
useless_tags = ["captures", "data", "tests", "Testcaptures"]
useless_tags = ["captures", "data", "tests", "additional_captures"]
for t in useless_tags:
if t in l:
l.remove(t)
return l
return list(set(l))

def main():
capture_dir = 'data/captures/Testcaptures'
#capture_dir = "tests/captures"
capture_dir = 'data/captures/additional_captures'
#capture_dir = 'data/captures/selected_captures'
#capture_dir = 'data/captures/ovh'
all_rows = {}
capture_dir = "tests/captures"
name = os.path.basename(capture_dir)
tags = []
json_file = 'data/output/testdata.json'
csv_file = 'data/output/testdata.csv'
json_file = f'data/output/data_{name}_duplicates.json'
csv_file = f'data/output/data_{name}_duplicates.csv'
structural_hash_count = {}

# Test if the extraction works and get fieldnames
Expand Down Expand Up @@ -74,7 +77,7 @@ def main():
structural_hash_count[hash] += 1
else:
structural_hash_count[hash] = 1
if structural_hash_count[hash] < 2:
if structural_hash_count[hash] > 1:
writer.writerow(observation)
all_rows[observation['uuid']] = observation
else:
Expand Down

0 comments on commit 09ec3b7

Please sign in to comment.