Skip to content

Commit

Permalink
Merge branch 'InternetHealthReport:main' into RovDetection#83
Browse files Browse the repository at this point in the history
  • Loading branch information
MAVRICK-1 authored Feb 7, 2024
2 parents 5547462 + c9ce0ed commit da50d45
Show file tree
Hide file tree
Showing 17 changed files with 396 additions and 113 deletions.
9 changes: 4 additions & 5 deletions config.json.example
Original file line number Diff line number Diff line change
Expand Up @@ -63,26 +63,26 @@
"iyp.crawlers.peeringdb.ix",
"iyp.crawlers.cloudflare.top100",
"iyp.crawlers.tranco.top1M",
"iyp.crawlers.cisco.umbrella_top1M",
"iyp.crawlers.openintel.tranco1m",
"iyp.crawlers.openintel.umbrella1m",
"iyp.crawlers.openintel.infra_ns",
"iyp.crawlers.openintel.infra_mx",
"iyp.crawlers.cisco.umbrella_top1M",
"iyp.crawlers.citizenlab.urldb",
"iyp.crawlers.inetintel.as_org",
"iyp.crawlers.pch.daily_routing_snapshots_v4",
"iyp.crawlers.pch.daily_routing_snapshots_v6",
"iyp.crawlers.emileaben.as_names",
"iyp.crawlers.ripe.atlas_probes",
"iyp.crawlers.ripe.atlas_measurements",
"iyp.crawlers.iana.root_zone",
"iyp.crawlers.alice_lg.amsix",
"iyp.crawlers.alice_lg.bcix",
"iyp.crawlers.alice_lg.decix",
"iyp.crawlers.alice_lg.ixbr",
"iyp.crawlers.alice_lg.linx",
"iyp.crawlers.alice_lg.megaport",
"iyp.crawlers.alice_lg.netnod",
"iyp.crawlers.openintel.dns_dependency_nl",
"iyp.crawlers.openintel.dns_dependency_jp",
"iyp.crawlers.cloudflare.dns_top_locations",
"iyp.crawlers.cloudflare.dns_top_ases"
],
Expand All @@ -91,8 +91,7 @@
"iyp.post.ip2prefix",
"iyp.post.address_family",
"iyp.post.country_information",
"iyp.post.dns_hierarchy",
"iyp.post.url2domain"
"iyp.post.url2hostname"
]
}
}
8 changes: 5 additions & 3 deletions iyp/crawlers/cisco/README.md
Original file line number Diff line number Diff line change
@@ -1,17 +1,19 @@
# Cisco Umbrella -- https://umbrella-static.s3-us-west-1.amazonaws.com/index.html

The popularity list contains most queried domains based on passive DNS usage across the Umbrella global network.
The popularity list contains most queried domains (ranging from TLDs to FQDNs)
based on passive DNS usage across the Umbrella global network.

IYP uses this data to create and annotate DomainName nodes.
IYP uses this data to create and annotate DomainName and HostName nodes.

## Graph representation

The rank of the domain is indicated by the `rank` property of the relationship.

```Cypher
(:DomainName {name: 'google.com'})-[:RANK {rank: 1}]->(:Ranking {name: 'Cisco Umbrella Top 1 million'})
(:HostName {name: 'www.google.com'})-[:RANK {rank: 8}]->(:Ranking {name: 'Cisco Umbrella Top 1 million'})
```

## Dependence

This crawler is not depending on other crawlers.
This crawler depends on `openintel.umbrella1m`.
61 changes: 53 additions & 8 deletions iyp/crawlers/cisco/umbrella_top1M.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,7 @@
from zipfile import ZipFile

import requests
import tldextract

from iyp import BaseCrawler, RequestStatusError

Expand All @@ -22,31 +23,75 @@ def run(self):

self.cisco_qid = self.iyp.get_node('Ranking', {'name': 'Cisco Umbrella Top 1 million'})

sys.stderr.write('Downloading latest list...\n')
logging.info('Downloading latest list...')
req = requests.get(URL)
if req.status_code != 200:
raise RequestStatusError('Error while fetching Cisco Umbrella Top 1M csv file')

links = []
domains = set()
# open zip file and read top list
with ZipFile(io.BytesIO(req.content)) as z:
with z.open('top-1m.csv') as list:
for i, row in enumerate(io.TextIOWrapper(list)):
with z.open('top-1m.csv') as top_list:
for i, row in enumerate(io.TextIOWrapper(top_list)):
row = row.rstrip()
rank, domain = row.split(',')

domains.add(domain)
links.append({'src_name': domain, 'dst_id': self.cisco_qid,
'props': [self.reference, {'rank': int(rank)}]})

name_id = self.iyp.batch_get_nodes_by_single_prop('DomainName', 'name', domains)
logging.info('Fetching DomainName/HostName nodes...')
domain_id = self.iyp.batch_get_nodes_by_single_prop('DomainName', 'name')
host_id = self.iyp.batch_get_nodes_by_single_prop('HostName', 'name')

# Umbrella mixes up domain and host names.
# By order of preferences we rank:
# 1) existing domain name
# 2) existing host name
# 3) do our best to figure out if it is a domain or host and create the
# corresponding node

new_domain_names = set()
new_host_names = set()
unprocessed_links = list()
processed_links = list()

logging.info('Building relationships...')
for link in links:
link['src_id'] = name_id[link['src_name']]
if link['src_name'] in domain_id:
link['src_id'] = domain_id[link['src_name']]
processed_links.append(link)
elif link['src_name'] in host_id:
link['src_id'] = host_id[link['src_name']]
processed_links.append(link)
else:
unprocessed_links.append(link)
ranked_thing = tldextract.extract(link['src_name'])
name = link['src_name']
if name == ranked_thing.registered_domain:
new_domain_names.add(name)
else:
new_host_names.add(name)

if new_domain_names:
logging.info(f'Pushing {len(new_domain_names)} additional DomainName nodes...')
domain_id.update(self.iyp.batch_get_nodes_by_single_prop('DomainName', 'name', new_domain_names, all=False))
if new_host_names:
logging.info(f'Pushing {len(new_host_names)} additional HostName nodes...')
host_id.update(self.iyp.batch_get_nodes_by_single_prop('HostName', 'name', new_host_names, all=False))

for link in unprocessed_links:
if link['src_name'] in domain_id:
link['src_id'] = domain_id[link['src_name']]
elif link['src_name'] in host_id:
link['src_id'] = host_id[link['src_name']]
else:
logging.error(f'Missing DomainName/HostName node for name "{link["src_name"]}". Should not happen.')
continue
processed_links.append(link)

# Push all links to IYP
self.iyp.batch_add_links('RANK', links)
logging.info(f'Pushing {len(processed_links)} RANK relationships...')
self.iyp.batch_add_links('RANK', processed_links)


def main() -> None:
Expand Down
8 changes: 6 additions & 2 deletions iyp/crawlers/cloudflare/README.md
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
# Cloudflare Radar -- https://radar.cloudflare.com/
# Cloudflare Radar -- https://radar.cloudflare.com/

Cloudflare uses aggregated and anonymized DNS queries to their `1.1.1.1` public resolver service to
provide various datasets, including:
Expand All @@ -17,8 +17,12 @@ provide various datasets, including:
- [Top 100 ASes querying each of the 10,000 highest ranked domain
names](https://developers.cloudflare.com/api/operations/radar_get__top_ases): Same as above, but
fetch AS numbers instead.

All rankings are based on one week of data.
Cloudflare Radar's top locations and ASes data is available for both domain names
and host names. Results likely account for all NS, A, and AAAA queries made to
Cloudflare's resolver. Since NS queries for host names make no sense, IYP links these
results to `DomainName` nodes.

## Graph representation

Expand Down
5 changes: 5 additions & 0 deletions iyp/crawlers/cloudflare/dns_top_ases.py
Original file line number Diff line number Diff line change
@@ -1,3 +1,8 @@
# Cloudflare Radar's top locations and ASes data is available for both domain
# names and host names. Results likely account for all NS, A, and AAAA queries made
# to Cloudflare's resolver. Since NS queries for host names make no sense, it seems
# more intuitive to link these results to DomainName nodes.

import argparse
import logging
import os
Expand Down
5 changes: 5 additions & 0 deletions iyp/crawlers/cloudflare/dns_top_locations.py
Original file line number Diff line number Diff line change
@@ -1,3 +1,8 @@
# Cloudflare Radar's top locations and ASes data is available for both domain
# names and host names. Results likely account for all NS, A, and AAAA queries made
# to Cloudflare's resolver. Since NS queries for host names make no sense, it seems
# more intuitive to link these results to DomainName nodes.

import argparse
import glob
import json
Expand Down
3 changes: 3 additions & 0 deletions iyp/crawlers/cloudflare/ranking_bucket.py
Original file line number Diff line number Diff line change
Expand Up @@ -24,6 +24,9 @@
class Crawler(BaseCrawler):
# Base Crawler provides access to IYP via self.iyp and setup a dictionary with the
# org/url/today's date in self.reference
#
# Cloudflare ranks second and third level domain names (not host names).
# See https://blog.cloudflare.com/radar-domain-rankings/

def run(self):
"""Fetch data and push to IYP."""
Expand Down
3 changes: 3 additions & 0 deletions iyp/crawlers/cloudflare/top100.py
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,9 @@
class Crawler(BaseCrawler):
# Base Crawler provides access to IYP via self.iyp
# and setup a dictionary with the org/url/today's date in self.reference
#
# Cloudflare ranks second and third level domain names (not host names).
# See https://blog.cloudflare.com/radar-domain-rankings/

def run(self):
"""Fetch data and push to IYP."""
Expand Down
20 changes: 14 additions & 6 deletions iyp/crawlers/openintel/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -4,26 +4,34 @@ The OpenINTEL measurement platform captures daily snapshots of the state of larg
global Domain Name System (DNS) by running a number of forward and reverse DNS measurements.

While OpenINTEL runs measurements to a variety of domain names, IYP currently only fetches data for
the [Tranco top 1 million list](https://data.openintel.nl/data/tranco1m/) and the CISCO Umbrella
the [Tranco top 1 million list](https://data.openintel.nl/data/tranco1m/) and the CISCO Umbrella
top 1 million list since it combines rankings.
IYP also gets the list of authoritative name servers seen by OpenINTEL.

IYP uses only `A` queries to add IP resolution for DomainName and AuthoritativeNameServer nodes.

A crawler for mail servers is also implemented but not used, as it would create a very large
number of links and this dataset is currently not requested or needed by anyone.

## Graph representation

IP resolution for popular domain names:
IP resolution for popular host names:

```Cypher
(:DomainName {name: 'google.com'})-[:RESOLVES_TO]->(:IP {ip: '142.250.179.142'})
(:HostName {name: 'google.com'})-[:RESOLVES_TO]->(:IP {ip: '142.250.179.142'})
```

IP resolution of authoritative name servers:

```Cypher
(:HostName:AuthoritativeNameServer {name: 'ns1.google.com'})-[:RESOLVES_TO]->(:IP {ip: '216.239.32.10'})
(:IP {ip: '216.239.32.10'})-[:SERVE]->(:Service {name: 'DNS'})
```

Domain names managed by name servers:

```Cypher
(:AuthoritativeNameServer {name: 'ns1.google.com'})-[:RESOLVES_TO]->(:IP {ip: '216.239.32.10'})
(:DomainName {name: 'google.com'})-[:MANAGED_BY]->(:HostName:AuthoritativeNameServer {name: 'ns1.google.com'})
```

## Dependence

This crawler does not depend on other crawlers.
Loading

0 comments on commit da50d45

Please sign in to comment.