From 09ec3b7c7315a257f7ddd45f3546a0184d11f1c9 Mon Sep 17 00:00:00 2001 From: Antonia Koch Date: Tue, 3 Sep 2024 23:04:47 +0200 Subject: [PATCH] Fix: number of redirects --- feature_extractor/feature_extractor.py | 4 +++- feature_extractor/har_features.py | 7 ++++++- main.py | 17 ++++++++++------- 3 files changed, 19 insertions(+), 9 deletions(-) diff --git a/feature_extractor/feature_extractor.py b/feature_extractor/feature_extractor.py index ef28e62..b1f70ad 100644 --- a/feature_extractor/feature_extractor.py +++ b/feature_extractor/feature_extractor.py @@ -191,6 +191,7 @@ def redirect_features(self): """Gets information about redirects to other pages Adds the following features to observation: + - 'ip' - 'number_redirects' - 'different_final_domain' """ @@ -198,7 +199,7 @@ def redirect_features(self): last_hostname = get_hostname(self.capture.get_last_redirect()) self.observation['ip'] = if_exists(self.capture.get_ips(), last_hostname, None) # CDN gewichtung warninglist - self.observation['number_redirects'] = len(self.capture.get_redirects()) + #self.observation['number_redirects'] = len(self.capture.get_redirects()) self.observation['different_final_domain'] = False if first_hostname == last_hostname else True def link_features(self): @@ -251,6 +252,7 @@ def har_features(self): - "third_party_html_content_ratio" - "initial_response_size" - "initial_response_ratio" + - 'number_redirects' """ har = self.capture.get_har() har_extractor = HARFeaturesExtractor(har) diff --git a/feature_extractor/har_features.py b/feature_extractor/har_features.py index 0c962f0..fedb1b4 100644 --- a/feature_extractor/har_features.py +++ b/feature_extractor/har_features.py @@ -21,8 +21,12 @@ def extract_features(self): total_html_content = 0 third_party_html_content = 0 initial_response_size = 0 + number_redirects = 0 for entry in self.har_parser.pages[0].entries: # type: ignore + status = entry['response']['status'] + if status in [301, 302, 303, 307, 308]: + number_redirects += 1 url = entry.request.url response_size = entry.response.bodySize content_type = entry.response.mimeType @@ -52,7 +56,8 @@ def extract_features(self): "third_party_data_ratio": third_party_data_ratio, "third_party_html_content_ratio": third_party_html_content_ratio, "initial_response_size": initial_response_size, - "initial_response_ratio": initial_response_ratio + "initial_response_ratio": initial_response_ratio, + "number_redirects": number_redirects } return features diff --git a/main.py b/main.py index d0a1c17..ab7d652 100644 --- a/main.py +++ b/main.py @@ -22,19 +22,22 @@ def make_observation(path:str, tags:list[str] = []) -> dict[str, Any]|None: return features def remove_useless_tags(l:list[str])-> list[str]: - useless_tags = ["captures", "data", "tests", "Testcaptures"] + useless_tags = ["captures", "data", "tests", "additional_captures"] for t in useless_tags: if t in l: l.remove(t) - return l + return list(set(l)) def main(): - capture_dir = 'data/captures/Testcaptures' + #capture_dir = "tests/captures" + capture_dir = 'data/captures/additional_captures' + #capture_dir = 'data/captures/selected_captures' + #capture_dir = 'data/captures/ovh' all_rows = {} - capture_dir = "tests/captures" + name = os.path.basename(capture_dir) tags = [] - json_file = 'data/output/testdata.json' - csv_file = 'data/output/testdata.csv' + json_file = f'data/output/data_{name}_duplicates.json' + csv_file = f'data/output/data_{name}_duplicates.csv' structural_hash_count = {} # Test if the extraction works and get fieldnames @@ -74,7 +77,7 @@ def main(): structural_hash_count[hash] += 1 else: structural_hash_count[hash] = 1 - if structural_hash_count[hash] < 2: + if structural_hash_count[hash] > 1: writer.writerow(observation) all_rows[observation['uuid']] = observation else: