From 09ec3b7c7315a257f7ddd45f3546a0184d11f1c9 Mon Sep 17 00:00:00 2001
From: Antonia Koch <antonia@cookfirst.de>
Date: Tue, 3 Sep 2024 23:04:47 +0200
Subject: [PATCH] Fix: count number_redirects from HAR response statuses

---
 feature_extractor/feature_extractor.py |  4 +++-
 feature_extractor/har_features.py      |  7 ++++++-
 main.py                                | 17 ++++++++++-------
 3 files changed, 19 insertions(+), 9 deletions(-)

diff --git a/feature_extractor/feature_extractor.py b/feature_extractor/feature_extractor.py
index ef28e62..b1f70ad 100644
--- a/feature_extractor/feature_extractor.py
+++ b/feature_extractor/feature_extractor.py
@@ -191,6 +191,7 @@ def redirect_features(self):
         """Gets information about redirects to other pages
 
         Adds the following features to observation:
+        - 'ip'
         - 'number_redirects'
         - 'different_final_domain'
         """
@@ -198,7 +199,7 @@ def redirect_features(self):
         last_hostname = get_hostname(self.capture.get_last_redirect())
 
         self.observation['ip'] = if_exists(self.capture.get_ips(), last_hostname, None) # CDN gewichtung warninglist
-        self.observation['number_redirects'] = len(self.capture.get_redirects())
+        #self.observation['number_redirects'] = len(self.capture.get_redirects())
         self.observation['different_final_domain'] = False if first_hostname == last_hostname else True
 
     def link_features(self):
@@ -251,6 +252,7 @@ def har_features(self):
         - "third_party_html_content_ratio"
         - "initial_response_size"
         - "initial_response_ratio"
+        - 'number_redirects'
         """
         har = self.capture.get_har()
         har_extractor = HARFeaturesExtractor(har)
diff --git a/feature_extractor/har_features.py b/feature_extractor/har_features.py
index 0c962f0..fedb1b4 100644
--- a/feature_extractor/har_features.py
+++ b/feature_extractor/har_features.py
@@ -21,8 +21,12 @@ def extract_features(self):
         total_html_content = 0
         third_party_html_content = 0
         initial_response_size = 0
+        number_redirects = 0
 
         for entry in self.har_parser.pages[0].entries: # type: ignore
+            status = entry['response']['status']
+            if status in [301, 302, 303, 307, 308]:
+                number_redirects += 1
             url = entry.request.url
             response_size = entry.response.bodySize
             content_type = entry.response.mimeType
@@ -52,7 +56,8 @@ def extract_features(self):
             "third_party_data_ratio": third_party_data_ratio,
             "third_party_html_content_ratio": third_party_html_content_ratio,
             "initial_response_size": initial_response_size,
-            "initial_response_ratio": initial_response_ratio 
+            "initial_response_ratio": initial_response_ratio, 
+            "number_redirects": number_redirects
         }
 
         return features
diff --git a/main.py b/main.py
index d0a1c17..ab7d652 100644
--- a/main.py
+++ b/main.py
@@ -22,19 +22,22 @@ def make_observation(path:str, tags:list[str] = []) -> dict[str, Any]|None:
         return features
     
 def remove_useless_tags(l:list[str])-> list[str]:
-    useless_tags = ["captures", "data", "tests", "Testcaptures"]
+    useless_tags = ["captures", "data", "tests", "additional_captures"]
     for t in useless_tags:
         if t in l:
             l.remove(t)
-    return l
+    return list(set(l))
 
 def main():
-    capture_dir = 'data/captures/Testcaptures'
+    #capture_dir = "tests/captures"
+    capture_dir = 'data/captures/additional_captures'
+    #capture_dir = 'data/captures/selected_captures'
+    #capture_dir = 'data/captures/ovh'
     all_rows = {}
-    capture_dir = "tests/captures"
+    name = os.path.basename(capture_dir)
     tags = []
-    json_file = 'data/output/testdata.json'
-    csv_file = 'data/output/testdata.csv'
+    json_file = f'data/output/data_{name}_duplicates.json'
+    csv_file = f'data/output/data_{name}_duplicates.csv'
     structural_hash_count = {}
     
     # Test if the extraction works and get fieldnames
@@ -74,7 +77,7 @@ def main():
                                 structural_hash_count[hash] += 1
                             else:
                                 structural_hash_count[hash] = 1
-                            if structural_hash_count[hash] < 2:
+                            if structural_hash_count[hash] > 1:
                                 writer.writerow(observation)
                                 all_rows[observation['uuid']] = observation
                             else: