From 838c8bed6936fa473d6510bb18c9f89f48ab177b Mon Sep 17 00:00:00 2001 From: Romain Date: Wed, 7 Feb 2024 16:55:53 +0900 Subject: [PATCH 1/2] DNS remodeling (#119) * update url2domain to url2hostname * remove iana root zone file and dns hierarchy from config file * Atlas measurement targets are now hostnames * update openintel crawlers to the new DNS model * umbrella now ranks a mix of DomainName and HostName nodes and should be run after openintel.umbrella1m * Add explanation for cloudflare DNS modeling * lower umbrella crawler in config file * update READMEs with the new DNS modeling * add (:Service {name:'DNS'}) node and link it to authoritative name servers * Nodes do not have reference properties * Normalize IPv6 addresses * Fix wrong crawler name * Typos and formatting * Remove infra_mx crawler since it does not do anything at the moment * Update Cisco Umbrella crawler - Batch create new nodes (happens more often than expected) - Add logging output - Do not use builtins as variable names * Remove redundant set and parameters * Remove Service node for now We could not decide on a name, so we will deal with this later. --------- Co-authored-by: Malte Tashiro --- config.json.example | 7 +- iyp/crawlers/cisco/README.md | 8 +- iyp/crawlers/cisco/umbrella_top1M.py | 61 +++++++++++-- iyp/crawlers/cloudflare/README.md | 8 +- iyp/crawlers/cloudflare/dns_top_ases.py | 5 ++ iyp/crawlers/cloudflare/dns_top_locations.py | 5 ++ iyp/crawlers/cloudflare/ranking_bucket.py | 3 + iyp/crawlers/cloudflare/top100.py | 3 + iyp/crawlers/openintel/README.md | 20 +++-- iyp/crawlers/openintel/__init__.py | 95 ++++++++++++-------- iyp/crawlers/openintel/infra_mx.py | 6 ++ iyp/crawlers/openintel/infra_ns.py | 3 +- iyp/crawlers/ripe/README.md | 4 +- iyp/crawlers/ripe/atlas_measurements.py | 26 +++--- iyp/post/{url2domain.py => url2hostname.py} | 18 ++-- 15 files changed, 183 insertions(+), 89 deletions(-) rename iyp/post/{url2domain.py => url2hostname.py} (68%) diff --git a/config.json.example b/config.json.example index abace94..81134ca 100644 --- a/config.json.example +++ b/config.json.example @@ -63,11 +63,10 @@ "iyp.crawlers.peeringdb.ix", "iyp.crawlers.cloudflare.top100", "iyp.crawlers.tranco.top1M", - "iyp.crawlers.cisco.umbrella_top1M", "iyp.crawlers.openintel.tranco1m", "iyp.crawlers.openintel.umbrella1m", "iyp.crawlers.openintel.infra_ns", - "iyp.crawlers.openintel.infra_mx", + "iyp.crawlers.cisco.umbrella_top1M", "iyp.crawlers.citizenlab.urldb", "iyp.crawlers.inetintel.as_org", "iyp.crawlers.pch.daily_routing_snapshots_v4", @@ -75,7 +74,6 @@ "iyp.crawlers.emileaben.as_names", "iyp.crawlers.ripe.atlas_probes", "iyp.crawlers.ripe.atlas_measurements", - "iyp.crawlers.iana.root_zone", "iyp.crawlers.alice_lg.amsix", "iyp.crawlers.alice_lg.bcix", "iyp.crawlers.alice_lg.decix", @@ -91,8 +89,7 @@ "iyp.post.ip2prefix", "iyp.post.address_family", "iyp.post.country_information", - "iyp.post.dns_hierarchy", - "iyp.post.url2domain" + "iyp.post.url2hostname" ] } } diff --git a/iyp/crawlers/cisco/README.md b/iyp/crawlers/cisco/README.md index 24d1811..839402e 100644 --- a/iyp/crawlers/cisco/README.md +++ b/iyp/crawlers/cisco/README.md @@ -1,8 +1,9 @@ # Cisco Umbrella -- https://umbrella-static.s3-us-west-1.amazonaws.com/index.html -The popularity list contains most queried domains based on passive DNS usage across the Umbrella global network. +The popularity list contains most queried domains (ranging from TLDs to FQDNs) +based on passive DNS usage across the Umbrella global network. 
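+
+For names that are not already present in IYP, the updated crawler falls back to
+`tldextract` to decide which node type to create; a minimal, illustrative sketch of
+that check:
+
+```python
+import tldextract
+
+def classify(name: str) -> str:
+    """Return the node label a ranked entry would get (sketch only)."""
+    if name == tldextract.extract(name).registered_domain:
+        return 'DomainName'
+    return 'HostName'
+
+assert classify('google.com') == 'DomainName'    # registered domain
+assert classify('www.google.com') == 'HostName'  # FQDN below a registered domain
+```
+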
-IYP uses this data to create and annotate DomainName nodes. +IYP uses this data to create and annotate DomainName and HostName nodes. ## Graph representation @@ -10,8 +11,9 @@ The rank of the domain is indicated by the `rank` property of the relationship. ```Cypher (:DomainName {name: 'google.com'})-[:RANK {rank: 1}]->(:Ranking {name: 'Cisco Umbrella Top 1 million'}) +(:HostName {name: 'www.google.com'})-[:RANK {rank: 8}]->(:Ranking {name: 'Cisco Umbrella Top 1 million'}) ``` ## Dependence -This crawler is not depending on other crawlers. +This crawler depends on `openintel.umbrella1m`. diff --git a/iyp/crawlers/cisco/umbrella_top1M.py b/iyp/crawlers/cisco/umbrella_top1M.py index 629f15b..714681b 100644 --- a/iyp/crawlers/cisco/umbrella_top1M.py +++ b/iyp/crawlers/cisco/umbrella_top1M.py @@ -6,6 +6,7 @@ from zipfile import ZipFile import requests +import tldextract from iyp import BaseCrawler, RequestStatusError @@ -22,31 +23,75 @@ def run(self): self.cisco_qid = self.iyp.get_node('Ranking', {'name': 'Cisco Umbrella Top 1 million'}) - sys.stderr.write('Downloading latest list...\n') + logging.info('Downloading latest list...') req = requests.get(URL) if req.status_code != 200: raise RequestStatusError('Error while fetching Cisco Umbrella Top 1M csv file') links = [] - domains = set() # open zip file and read top list with ZipFile(io.BytesIO(req.content)) as z: - with z.open('top-1m.csv') as list: - for i, row in enumerate(io.TextIOWrapper(list)): + with z.open('top-1m.csv') as top_list: + for i, row in enumerate(io.TextIOWrapper(top_list)): row = row.rstrip() rank, domain = row.split(',') - domains.add(domain) links.append({'src_name': domain, 'dst_id': self.cisco_qid, 'props': [self.reference, {'rank': int(rank)}]}) - name_id = self.iyp.batch_get_nodes_by_single_prop('DomainName', 'name', domains) + logging.info('Fetching DomainName/HostName nodes...') + domain_id = self.iyp.batch_get_nodes_by_single_prop('DomainName', 'name') + host_id = self.iyp.batch_get_nodes_by_single_prop('HostName', 'name') + # Umbrella mixes up domain and host names. 
+ # By order of preferences we rank: + # 1) existing domain name + # 2) existing host name + # 3) do our best to figure out if it is a domain or host and create the + # corresponding node + + new_domain_names = set() + new_host_names = set() + unprocessed_links = list() + processed_links = list() + + logging.info('Building relationships...') for link in links: - link['src_id'] = name_id[link['src_name']] + if link['src_name'] in domain_id: + link['src_id'] = domain_id[link['src_name']] + processed_links.append(link) + elif link['src_name'] in host_id: + link['src_id'] = host_id[link['src_name']] + processed_links.append(link) + else: + unprocessed_links.append(link) + ranked_thing = tldextract.extract(link['src_name']) + name = link['src_name'] + if name == ranked_thing.registered_domain: + new_domain_names.add(name) + else: + new_host_names.add(name) + + if new_domain_names: + logging.info(f'Pushing {len(new_domain_names)} additional DomainName nodes...') + domain_id.update(self.iyp.batch_get_nodes_by_single_prop('DomainName', 'name', new_domain_names, all=False)) + if new_host_names: + logging.info(f'Pushing {len(new_host_names)} additional HostName nodes...') + host_id.update(self.iyp.batch_get_nodes_by_single_prop('HostName', 'name', new_host_names, all=False)) + + for link in unprocessed_links: + if link['src_name'] in domain_id: + link['src_id'] = domain_id[link['src_name']] + elif link['src_name'] in host_id: + link['src_id'] = host_id[link['src_name']] + else: + logging.error(f'Missing DomainName/HostName node for name "{link["src_name"]}". Should not happen.') + continue + processed_links.append(link) # Push all links to IYP - self.iyp.batch_add_links('RANK', links) + logging.info(f'Pushing {len(processed_links)} RANK relationships...') + self.iyp.batch_add_links('RANK', processed_links) def main() -> None: diff --git a/iyp/crawlers/cloudflare/README.md b/iyp/crawlers/cloudflare/README.md index 4ba886f..7ce4aee 100644 --- a/iyp/crawlers/cloudflare/README.md +++ b/iyp/crawlers/cloudflare/README.md @@ -1,4 +1,4 @@ -# Cloudflare Radar -- https://radar.cloudflare.com/ +# Cloudflare Radar -- https://radar.cloudflare.com/ Cloudflare uses aggregated and anonymized DNS queries to their `1.1.1.1` public resolver service to provide various datasets, including: @@ -17,8 +17,12 @@ provide various datasets, including: - [Top 100 ASes querying each of the 10,000 highest ranked domain names](https://developers.cloudflare.com/api/operations/radar_get__top_ases): Same as above, but fetch AS numbers instead. - + All rankings are based on one week of data. +Cloudflare radar's top location and ASes is available for both domain names +and host names. Results are likely accounting for all NS, A, AAAA queries made to +Cloudflare's resolver. Since NS queries for host names make no sense IYP links these +results to `DomainName` nodes. ## Graph representation diff --git a/iyp/crawlers/cloudflare/dns_top_ases.py b/iyp/crawlers/cloudflare/dns_top_ases.py index f24c952..8c15ac0 100644 --- a/iyp/crawlers/cloudflare/dns_top_ases.py +++ b/iyp/crawlers/cloudflare/dns_top_ases.py @@ -1,3 +1,8 @@ +# Cloudflare radar's top location and ASes is available for both domain names +# and host names. Results are likely accounting for all NS, A, AAAA queries made to +# Cloudflare's resolver. Since NS queries for host names make no sense it seems +# more intuitive to link these results to DomainName nodes. 
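+#
+# For example, A/AAAA queries for 'www.google.com' and NS queries for 'google.com'
+# presumably both count towards the figures reported for 'google.com', so the
+# per-AS results are attached to the (:DomainName {name: 'google.com'}) node
+# rather than to a HostName node.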
+ import argparse import logging import os diff --git a/iyp/crawlers/cloudflare/dns_top_locations.py b/iyp/crawlers/cloudflare/dns_top_locations.py index 4f4118f..46e46a0 100644 --- a/iyp/crawlers/cloudflare/dns_top_locations.py +++ b/iyp/crawlers/cloudflare/dns_top_locations.py @@ -1,3 +1,8 @@ +# Cloudflare radar's top location and ASes is available for both domain names +# and host names. Results are likely accounting for all NS, A, AAAA queries made to +# Cloudflare's resolver. Since NS queries for host names make no sense it seems +# more intuitive to link these results to DomainName nodes. + import argparse import glob import json diff --git a/iyp/crawlers/cloudflare/ranking_bucket.py b/iyp/crawlers/cloudflare/ranking_bucket.py index 7193ad6..fbea3f0 100644 --- a/iyp/crawlers/cloudflare/ranking_bucket.py +++ b/iyp/crawlers/cloudflare/ranking_bucket.py @@ -24,6 +24,9 @@ class Crawler(BaseCrawler): # Base Crawler provides access to IYP via self.iyp and setup a dictionary with the # org/url/today's date in self.reference + # + # Cloudflare ranks second and third level domain names (not host names). + # See https://blog.cloudflare.com/radar-domain-rankings/ def run(self): """Fetch data and push to IYP.""" diff --git a/iyp/crawlers/cloudflare/top100.py b/iyp/crawlers/cloudflare/top100.py index 5e2f5fb..d189da8 100644 --- a/iyp/crawlers/cloudflare/top100.py +++ b/iyp/crawlers/cloudflare/top100.py @@ -21,6 +21,9 @@ class Crawler(BaseCrawler): # Base Crawler provides access to IYP via self.iyp # and setup a dictionary with the org/url/today's date in self.reference + # + # Cloudflare ranks second and third level domain names (not host names). + # See https://blog.cloudflare.com/radar-domain-rankings/ def run(self): """Fetch data and push to IYP.""" diff --git a/iyp/crawlers/openintel/README.md b/iyp/crawlers/openintel/README.md index 70cf9c5..0f3ec5f 100644 --- a/iyp/crawlers/openintel/README.md +++ b/iyp/crawlers/openintel/README.md @@ -4,26 +4,34 @@ The OpenINTEL measurement platform captures daily snapshots of the state of larg global Domain Name System (DNS) by running a number of forward and reverse DNS measurements. While OpenINTEL runs measurements to a variety of domain names, IYP currently only fetches data for -the [Tranco top 1 million list](https://data.openintel.nl/data/tranco1m/) and the CISCO Umbrella +the [Tranco top 1 million list](https://data.openintel.nl/data/tranco1m/) and the CISCO Umbrella top 1 million list since it combines rankings. IYP also get the list of authoritative names servers seen by OpenINTEL. -IYP uses only `A` queries to add IP resolution for DomainName and AuthoritativeNameServer nodes. - A crawler of mail servers is also implemented but not used as it creates a very large number of links and this dataset is currently not requested/needed by anyone. 
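+
+Node types are derived from the DNS record type of each answer: `query_name`s of NS
+records become DomainName nodes, `query_name`s of A/AAAA records become HostName
+nodes, and NS record values (`ns_address`) become HostName nodes labelled
+AuthoritativeNameServer. A condensed, illustrative sketch (column names as in the
+OpenINTEL Parquet data read by the crawler):
+
+```python
+import pandas as pd
+
+# Toy stand-in for the Parquet data loaded by the crawler.
+df = pd.DataFrame([
+    {'query_name': 'google.com', 'response_type': 'NS', 'ns_address': 'ns1.google.com'},
+    {'query_name': 'www.google.com', 'response_type': 'A', 'ns_address': None},
+])
+
+domain_names = set(df[df.response_type == 'NS']['query_name'])            # DomainName
+host_names = set(df[df.response_type.isin(['A', 'AAAA'])]['query_name'])  # HostName
+name_servers = set(df[df.ns_address.notnull()]['ns_address'])             # HostName + AuthoritativeNameServer
+```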
## Graph representation -IP resolution for popular domain names: +IP resolution for popular host names: + ```Cypher -(:DomainName {name: 'google.com'})-[:RESOLVES_TO]->(:IP {ip: '142.250.179.142'}) +(:HostName {name: 'google.com'})-[:RESOLVES_TO]->(:IP {ip: '142.250.179.142'}) ``` IP resolution of authoritative name servers: + +```Cypher +(:HostName:AuthoritativeNameServer {name: 'ns1.google.com'})-[:RESOLVES_TO]->(:IP {ip: '216.239.32.10'}) +(:IP {ip: '216.239.32.10'})-[:SERVE]->(:Service {name: 'DNS'}) +``` + +Domain names managed by name servers: + ```Cypher -(:AuthoritativeNameServer {name: 'ns1.google.com'})-[:RESOLVES_TO]->(:IP {ip: '216.239.32.10'}) +(:DomainName {name: 'google.com'})-[:MANAGED_BY]->(:HostName:AuthoritativeNameServer {name: 'ns1.google.com'}) ``` + ## Dependence This crawler is not depending on other crawlers. diff --git a/iyp/crawlers/openintel/__init__.py b/iyp/crawlers/openintel/__init__.py index 75b3c05..d255205 100644 --- a/iyp/crawlers/openintel/__init__.py +++ b/iyp/crawlers/openintel/__init__.py @@ -7,6 +7,7 @@ import logging import os import tempfile +from ipaddress import IPv6Address import arrow import boto3 @@ -42,17 +43,11 @@ def valid_date(s): class OpenIntelCrawler(BaseCrawler): - def __init__(self, organization, url, name, dataset, additional_domain_type=str()): + def __init__(self, organization, url, name, dataset): """Initialization of the OpenIntel crawler requires the name of the dataset - (e.g. tranco or infra:ns). - - If the dataset contains special types of domain - names, an additional label can be specified (e.g., `AuthoritativeNameServer`) - that will be attached to the `DomainName` nodes. - """ + (e.g. tranco or infra:ns).""" self.dataset = dataset - self.additional_domain_type = additional_domain_type super().__init__(organization, url, name) def get_parquet(self): @@ -179,52 +174,74 @@ def run(self): print(f'Read {len(df)} unique records from {len(self.pandas_df_list)} Parquet file(s).') - # Only domain names from the `query_name` column that will receive the - # additional_domain_type label (if present). - query_domain_names = set(df['query_name']) - # Name server domain names. - ns_domain_names = set(df[df.ns_address.notnull()]['ns_address']) - # All domain names, including the ones from the name server column. - all_domain_names = query_domain_names.union(ns_domain_names) - # Create all DomainName nodes. - domain_id = self.iyp.batch_get_nodes_by_single_prop('DomainName', 'name', all_domain_names) - # Get node IDs for NS nodes and add NS label. - ns_id = {name: domain_id[name] for name in ns_domain_names} + # query_names for NS records are domain names + domain_names = set(df[df.response_type == 'NS']['query_name']) + + # response values of NS records are name servers + name_servers = set(df[df.ns_address.notnull()]['ns_address']) + + # query_names for A and AAAA records are host names + host_names = set(df[(df.response_type == 'A') | (df.response_type == 'AAAA')]['query_name']) + + ipv6_addresses = set() + # Normalize IPv6 addresses. 
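+        # e.g. '2001:0db8:0000:0000:0000:0000:0000:0001' and '2001:db8::1' denote
+        # the same address; storing the compressed form avoids duplicate IP nodes.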
+ for ip in df[df.ip6_address.notnull()]['ip6_address']: + try: + ip_normalized = IPv6Address(ip).compressed + except ValueError as e: + logging.error(f'Ignoring invalid IPv6 address "{ip}": {e}') + continue + ipv6_addresses.add(ip_normalized) + + # Get/create all nodes: + domain_id = self.iyp.batch_get_nodes_by_single_prop('DomainName', 'name', domain_names) + host_id = self.iyp.batch_get_nodes_by_single_prop('HostName', 'name', host_names) + ns_id = self.iyp.batch_get_nodes_by_single_prop('HostName', 'name', name_servers) self.iyp.batch_add_node_label(list(ns_id.values()), 'AuthoritativeNameServer') - # Add additional node label if present. - additional_id = set() - if self.additional_domain_type and self.additional_domain_type != 'DomainName': - additional_id = {domain_id[name] for name in query_domain_names} - self.iyp.batch_add_node_label(list(additional_id), self.additional_domain_type) ip4_id = self.iyp.batch_get_nodes_by_single_prop('IP', 'ip', set(df[df.ip4_address.notnull()]['ip4_address'])) - ip6_id = self.iyp.batch_get_nodes_by_single_prop('IP', 'ip', set(df[df.ip6_address.notnull()]['ip6_address'])) + ip6_id = self.iyp.batch_get_nodes_by_single_prop('IP', 'ip', ipv6_addresses) + + print(f'Got {len(domain_id)} domains, {len(ns_id)} nameservers, {len(host_id)} hosts, {len(ip4_id)} IPv4, ' + f'{len(ip6_id)} IPv6') + + # Compute links res_links = [] mng_links = [] + partof_links = [] - print(f'Got {len(domain_id)} domains, {len(ns_id)} nameservers, {len(ip4_id)} IPv4, {len(ip6_id)} IPv6') - if self.additional_domain_type: - print(f'Added "{self.additional_domain_type}" label to {len(additional_id)} nodes.') - + # RESOLVES_TO and MANAGED_BY links for row in df.itertuples(): - domain_qid = domain_id[row.query_name] + + # NS Record + if row.response_type == 'NS' and row.ns_address: + domain_qid = domain_id[row.query_name] + ns_qid = ns_id[row.ns_address] + mng_links.append({'src_id': domain_qid, 'dst_id': ns_qid, 'props': [self.reference]}) # A Record - if row.response_type == 'A' and row.ip4_address: + elif row.response_type == 'A' and row.ip4_address: + host_qid = host_id[row.query_name] ip_qid = ip4_id[row.ip4_address] - res_links.append({'src_id': domain_qid, 'dst_id': ip_qid, 'props': [self.reference]}) + res_links.append({'src_id': host_qid, 'dst_id': ip_qid, 'props': [self.reference]}) # AAAA Record elif row.response_type == 'AAAA' and row.ip6_address: - ip_qid = ip6_id[row.ip6_address] - res_links.append({'src_id': domain_qid, 'dst_id': ip_qid, 'props': [self.reference]}) - - # NS Record - elif row.response_type == 'NS' and row.ns_address: - ns_qid = ns_id[row.ns_address] - mng_links.append({'src_id': domain_qid, 'dst_id': ns_qid, 'props': [self.reference]}) + try: + ip_normalized = IPv6Address(row.ip6_address).compressed + except ValueError: + # Error message was already logged above. 
+ continue + host_qid = host_id[row.query_name] + ip_qid = ip6_id[ip_normalized] + res_links.append({'src_id': host_qid, 'dst_id': ip_qid, 'props': [self.reference]}) + + # PART_OF links between HostNames and DomainNames + for hd in host_names.intersection(domain_names): + partof_links.append({'src_id': host_id[hd], 'dst_id': domain_id[hd], 'props': [self.reference]}) print(f'Computed {len(res_links)} RESOLVES_TO links and {len(mng_links)} MANAGED_BY links') # Push all links to IYP self.iyp.batch_add_links('RESOLVES_TO', res_links) self.iyp.batch_add_links('MANAGED_BY', mng_links) + self.iyp.batch_add_links('PART_OF', partof_links) diff --git a/iyp/crawlers/openintel/infra_mx.py b/iyp/crawlers/openintel/infra_mx.py index 24266d0..3841c77 100644 --- a/iyp/crawlers/openintel/infra_mx.py +++ b/iyp/crawlers/openintel/infra_mx.py @@ -19,6 +19,12 @@ def __init__(self, organization, url, name): def main() -> None: + + ############################################ + # This crawler is not working the NODE_TYPE argument has been deprecated + ############################################ + return + parser = argparse.ArgumentParser() parser.add_argument('--unit-test', action='store_true') args = parser.parse_args() diff --git a/iyp/crawlers/openintel/infra_ns.py b/iyp/crawlers/openintel/infra_ns.py index 776f1aa..a4395c6 100644 --- a/iyp/crawlers/openintel/infra_ns.py +++ b/iyp/crawlers/openintel/infra_ns.py @@ -10,12 +10,11 @@ NAME = 'openintel.infra_ns' DATASET = 'infra:ns' -NODE_TYPE = 'AuthoritativeNameServer' class Crawler(OpenIntelCrawler): def __init__(self, organization, url, name): - super().__init__(organization, url, name, DATASET, NODE_TYPE) + super().__init__(organization, url, name, DATASET) def main() -> None: diff --git a/iyp/crawlers/ripe/README.md b/iyp/crawlers/ripe/README.md index f56196b..ce54b07 100644 --- a/iyp/crawlers/ripe/README.md +++ b/iyp/crawlers/ripe/README.md @@ -52,7 +52,7 @@ ASN(s), and country. We fetch the [list of measurements](https://atlas.ripe.net/docs/apis/rest-api-manual/measurements/) to obtain metadata of *ongoing* Atlas measurements. `AtlasProbe`s are `PART_OF` -`AtlasMeasurement`s and measurements `TARGET` one or more `IP`s, a `DomainName`, or +`AtlasMeasurement`s and measurements `TARGET` one or more `IP`s, a `HostName`, or both. The Atlas platform also maps the measurement target to an `AS` number if possible. The crawler includes this relationship as well. @@ -61,7 +61,7 @@ never connected or are abandoned. ```Cypher (:AtlasProbe {id: 6425})-[:PART_OF]->(:AtlasMeasurement {id: 17635549})-[:TARGET]->(:AS {asn: 2497}) -(:AtlasProbe {id: 6425})-[:PART_OF]->(:AtlasMeasurement {id: 17635549})-[:TARGET]->(:DomainName {name: 'jp-tyo-as2497.anchors.atlas.ripe.net'}) +(:AtlasProbe {id: 6425})-[:PART_OF]->(:AtlasMeasurement {id: 17635549})-[:TARGET]->(:HostName {name: 'jp-tyo-as2497.anchors.atlas.ripe.net'}) (:AtlasProbe {id: 6425})-[:PART_OF]->(:AtlasMeasurement {id: 17635549})-[:TARGET]->(:IP {ip: '202.214.87.158'}) ``` diff --git a/iyp/crawlers/ripe/atlas_measurements.py b/iyp/crawlers/ripe/atlas_measurements.py index d0f9aac..79770f0 100644 --- a/iyp/crawlers/ripe/atlas_measurements.py +++ b/iyp/crawlers/ripe/atlas_measurements.py @@ -73,7 +73,7 @@ def __transform_data(data): # on the '_' delimiter, this action would potentially # cause a TypeError from flatdict if it isn't handled properly. 
target_info = { - 'domain': item.pop('target', None), + 'hostname': item.pop('target', None), 'asn': item.pop('target_asn', None), 'ip': item.pop('target_ip', None), 'prefix': item.pop('target_prefix', None), @@ -148,7 +148,7 @@ def run(self): probe_ids = set() ips = set() ases = set() - domains = set() + hostnames = set() valid_probe_measurements = list() @@ -171,16 +171,16 @@ def run(self): probe_af = int(probe_measurement['af']) resolved_ips[i] = ipaddress.ip_address(resolved_ips[i]).compressed if probe_af == 6 else resolved_ips[i] - domain = probe_measurement['target']['domain'] - if domain == '' or self.__is_valid_ip(domain): - domain = None - probe_measurement['target']['domain'] = None + hostname = probe_measurement['target']['hostname'] + if hostname == '' or self.__is_valid_ip(hostname): + hostname = None + probe_measurement['target']['hostname'] = None asn = probe_measurement['target']['asn'] probe_ids_participated = probe_measurement['current_probes'] self.__add_if_not_none(probe_measurement_id, probe_measurement_ids) - self.__add_if_not_none(domain, domains) + self.__add_if_not_none(hostname, hostnames) self.__add_if_not_none(asn, ases) self.__add_if_not_none_values(resolved_ips, ips) self.__add_if_not_none_values(probe_ids_participated, probe_ids) @@ -204,8 +204,8 @@ def run(self): probe_ids = self.iyp.batch_get_nodes_by_single_prop('AtlasProbe', 'id', probe_ids, all=False, create=True) logging.info(f'{len(ips)} IPs') ip_ids = self.iyp.batch_get_nodes_by_single_prop('IP', 'ip', ips, all=False, create=True) - logging.info(f'{len(domains)} domains') - domain_ids = self.iyp.batch_get_nodes_by_single_prop('DomainName', 'name', domains, all=False, create=True) + logging.info(f'{len(hostnames)} hostnames') + hostname_ids = self.iyp.batch_get_nodes_by_single_prop('HostName', 'name', hostnames, all=False, create=True) logging.info(f'{len(ases)} ASNs') asn_ids = self.iyp.batch_get_nodes_by_single_prop('AS', 'asn', ases, all=False, create=True) @@ -226,10 +226,10 @@ def run(self): target_links.append({'src_id': probe_measurement_qid, 'dst_id': asn_qid, 'props': [probe_measurement_reference]}) - probe_measurement_domain = probe_measurement['target']['domain'] - if probe_measurement_domain: - domain_qid = domain_ids[probe_measurement_domain] - target_links.append({'src_id': probe_measurement_qid, 'dst_id': domain_qid, + probe_measurement_hostname = probe_measurement['target']['hostname'] + if probe_measurement_hostname: + hostname_qid = hostname_ids[probe_measurement_hostname] + target_links.append({'src_id': probe_measurement_qid, 'dst_id': hostname_qid, 'props': [probe_measurement_reference]}) probe_measurement_ips = self.__get_all_resolved_ips(probe_measurement) diff --git a/iyp/post/url2domain.py b/iyp/post/url2hostname.py similarity index 68% rename from iyp/post/url2domain.py rename to iyp/post/url2hostname.py index 6a89f94..6429a3f 100644 --- a/iyp/post/url2domain.py +++ b/iyp/post/url2hostname.py @@ -8,27 +8,27 @@ class PostProcess(BasePostProcess): def run(self): - """Link URLs and their corresponding DomainNames.""" + """Link URLs and their corresponding HostNames.""" # Get all URL nodes. 
url_id = self.iyp.batch_get_nodes_by_single_prop('URL', 'url') - # Get all DomainName Nodes - domain_id = self.iyp.batch_get_nodes_by_single_prop('DomainName', 'name') + # Get all HostName Nodes + hostname_id = self.iyp.batch_get_nodes_by_single_prop('HostName', 'name') # Compute links links = [] for url, url_qid in url_id.items(): - # Extract domain name from URL - domain = tldextract.extract(url).registered_domain + # Extract host name from URL + hostname = tldextract.extract(url).fqdn - # Get DomainName node for the domain - domain_qid = domain_id.get(domain) + # Get HostName node for the fqdn of the URL + hostname_qid = hostname_id.get(hostname) - if domain_qid is not None: + if hostname_qid is not None: links.append({ 'src_id': url_qid, - 'dst_id': domain_qid, + 'dst_id': hostname_qid, 'props': [self.reference] }) From c67213184f42e21d6a7d62d59a593a2a24c43c9e Mon Sep 17 00:00:00 2001 From: Malte Tashiro Date: Wed, 7 Feb 2024 09:06:10 +0000 Subject: [PATCH 2/2] Add OpenINTEL DNS dependency crawler Integrate with existing files and remove some unnecessary stuff. Co-authored-by: Raffaele Sommese --- config.json.example | 2 + iyp/crawlers/openintel/__init__.py | 145 ++++++++++++++++---- iyp/crawlers/openintel/dns_dependency_jp.py | 45 ++++++ iyp/crawlers/openintel/dns_dependency_nl.py | 45 ++++++ 4 files changed, 213 insertions(+), 24 deletions(-) create mode 100644 iyp/crawlers/openintel/dns_dependency_jp.py create mode 100644 iyp/crawlers/openintel/dns_dependency_nl.py diff --git a/config.json.example b/config.json.example index 81134ca..a281435 100644 --- a/config.json.example +++ b/config.json.example @@ -81,6 +81,8 @@ "iyp.crawlers.alice_lg.linx", "iyp.crawlers.alice_lg.megaport", "iyp.crawlers.alice_lg.netnod", + "iyp.crawlers.openintel.dns_dependency_nl", + "iyp.crawlers.openintel.dns_dependency_jp", "iyp.crawlers.cloudflare.dns_top_locations", "iyp.crawlers.cloudflare.dns_top_ases" ], diff --git a/iyp/crawlers/openintel/__init__.py b/iyp/crawlers/openintel/__init__.py index d255205..a8a6dbc 100644 --- a/iyp/crawlers/openintel/__init__.py +++ b/iyp/crawlers/openintel/__init__.py @@ -1,12 +1,12 @@ # Simple Python script to fetch domain name to IP address mappings from OpenINTEL data -# Based on code from Mattijs Jonker +# OpenIntelCrawler is based on code from Mattijs Jonker import argparse -import datetime import json import logging import os import tempfile +from datetime import datetime, timedelta, timezone from ipaddress import IPv6Address import arrow @@ -15,15 +15,11 @@ import pandas as pd import requests -from iyp import BaseCrawler +from iyp import BaseCrawler, RequestStatusError TMP_DIR = './tmp' os.makedirs(TMP_DIR, exist_ok=True) -URL = 'https://data.openintel.nl/data/' -ORG = 'OpenINTEL' -NAME = 'openintel.*' - # credentials OPENINTEL_ACCESS_KEY = '' OPENINTEL_SECRET_KEY = '' @@ -36,7 +32,7 @@ def valid_date(s): try: - return datetime.datetime.strptime(s, '%Y-%m-%d') + return datetime.strptime(s, '%Y-%m-%d') except ValueError: msg = 'not a valid ISO 8601 date: {0!r}'.format(s) raise argparse.ArgumentTypeError(msg) @@ -78,22 +74,24 @@ def get_parquet(self): # check on the website if yesterday's data is available yesterday = arrow.utcnow().shift(days=-1) - url = URL.format(year=yesterday.year, month=yesterday.month, day=yesterday.day) - try: - req = requests.head(url) - - attempt = 3 - while req.status_code != 200 and attempt > 0: - print(req.status_code) - attempt -= 1 - yesterday = yesterday.shift(days=-1) - url = URL.format(year=yesterday.year, month=yesterday.month, 
day=yesterday.day) - req = requests.head(url) - - except requests.exceptions.ConnectionError: - logging.warning("Cannot reach OpenINTEL website, try yesterday's data") - yesterday = arrow.utcnow().shift(days=-1) - url = URL.format(year=yesterday.year, month=yesterday.month, day=yesterday.day) + # FIXME Check at the proper place. Remove flake8 exception afterwards. + # flake8: noqa + # url = URL.format(year=yesterday.year, month=yesterday.month, day=yesterday.day) + # try: + # req = requests.head(url) + + # attempt = 3 + # while req.status_code != 200 and attempt > 0: + # print(req.status_code) + # attempt -= 1 + # yesterday = yesterday.shift(days=-1) + # url = URL.format(year=yesterday.year, month=yesterday.month, day=yesterday.day) + # req = requests.head(url) + + # except requests.exceptions.ConnectionError: + # logging.warning("Cannot reach OpenINTEL website, try yesterday's data") + # yesterday = arrow.utcnow().shift(days=-1) + # url = URL.format(year=yesterday.year, month=yesterday.month, day=yesterday.day) logging.warning(f'Fetching data for {yesterday}') @@ -245,3 +243,102 @@ def run(self): self.iyp.batch_add_links('RESOLVES_TO', res_links) self.iyp.batch_add_links('MANAGED_BY', mng_links) self.iyp.batch_add_links('PART_OF', partof_links) + + +class DnsDependencyCrawler(BaseCrawler): + + def __init__(self, organization, url, name): + super().__init__(organization, url, name) + + def run(self): + # Extract current date for partitioning + logging.info('Probing available data') + max_lookback_in_weeks = 1 + for lookback in range(0, max_lookback_in_weeks + 1): + current_date = datetime.now(tz=timezone.utc) - timedelta(weeks=lookback) + year = current_date.strftime('%Y') + week = current_date.strftime('%U') + base_url = f'{self.reference["reference_url"]}/year={year}/week={week}' + probe_url = f'{base_url}/domain_nodes.json.gz' + if requests.head(probe_url).ok: + logging.info(f'Using year={year}/week={week} ({current_date.strftime("%Y-%m-%d")})') + break + else: + logging.error('Failed to find data within the specified lookback interval.') + raise RequestStatusError('Failed to find data within the specified lookback interval.') + + logging.info('Reading domain names') + domains = pd.read_json(f'{base_url}/domain_nodes.json.gz', lines=True) + logging.info('Reading host names') + hosts = pd.read_json(f'{base_url}/host_nodes.json.gz', lines=True) + logging.info('Reading IPs') + ips = pd.read_json(f'{base_url}/ip_nodes.json.gz', lines=True) + logging.info('Reading connections') + connections = pd.read_json(f'{base_url}/connections.json.gz', lines=True) + + unique_domain_names = set(domains['name']) + unique_host_names = set(hosts['name']) + unique_ips = set(ips['address']) + logging.info(f'Pushing/getting {len(unique_domain_names)} DomainName {len(unique_host_names)} HostName ' + f'{len(unique_ips)} IP nodes...') + domains_id = self.iyp.batch_get_nodes_by_single_prop('DomainName', 'name', unique_domain_names) + hosts_id = self.iyp.batch_get_nodes_by_single_prop('HostName', 'name', unique_host_names) + ips_id = self.iyp.batch_get_nodes_by_single_prop('IP', 'ip', unique_ips) + + links_parent = list() + links_part_of = list() + links_alias_of = list() + links_managed_by = list() + links_resolves_to = list() + + logging.info('Computing relationships...') + start_ts = datetime.now().timestamp() + for index, connection in connections.iterrows(): + if connection['relation_name'] == 'PARENT': + links_parent.append({ + 'src_id': domains_id[connection['from_nodeKey']], + 'dst_id': 
domains_id[connection['to_nodeKey']], + 'props': [self.reference, connection['properties']], + }) + elif connection['relation_name'] == 'MANAGED_BY': + links_managed_by.append({ + 'src_id': domains_id[connection['from_nodeKey']], + 'dst_id': hosts_id[connection['to_nodeKey']], + 'props': [self.reference, connection['properties']], + }) + elif connection['relation_name'] == 'PART_OF': + links_part_of.append({ + 'src_id': hosts_id[connection['from_nodeKey']], + 'dst_id': domains_id[connection['to_nodeKey']], + 'props': [self.reference, connection['properties']], + }) + elif connection['relation_name'] == 'ALIAS_OF': + links_alias_of.append({ + 'src_id': hosts_id[connection['from_nodeKey']], + 'dst_id': hosts_id[connection['to_nodeKey']], + 'props': [self.reference, connection['properties']], + }) + elif connection['relation_name'] == 'RESOLVES_TO': + links_resolves_to.append({ + 'src_id': hosts_id[connection['from_nodeKey']], + 'dst_id': ips_id[connection['to_nodeKey']], + 'props': [self.reference, connection['properties']], + }) + else: + logging.error(f'Unknown relationship type: {connection["relation_name"]}') + stop_ts = datetime.now().timestamp() + logging.info(f'{stop_ts - start_ts:.2f}s elapsed') + + # Push all links to IYP + logging.info(f'Pushing {len(links_parent)} PARENT {len(links_part_of)} PART_OF {len(links_alias_of)} ALIAS_OF ' + f'{len(links_managed_by)} MANAGED_BY {len(links_resolves_to)} RESOLVES_TO relationships...') + self.iyp.batch_add_links('PARENT', links_parent) + self.iyp.batch_add_links('PART_OF', links_part_of) + self.iyp.batch_add_links('ALIAS_OF', links_alias_of) + self.iyp.batch_add_links('MANAGED_BY', links_managed_by) + self.iyp.batch_add_links('RESOLVES_TO', links_resolves_to) + + # Push the Authoritative NS Label + ns_id = [link['dst_id'] for link in links_managed_by] + logging.info(f'Adding AuthoritativeNameServer label to {len(ns_id)} nodes') + self.iyp.batch_add_node_label(ns_id, 'AuthoritativeNameServer') diff --git a/iyp/crawlers/openintel/dns_dependency_jp.py b/iyp/crawlers/openintel/dns_dependency_jp.py new file mode 100644 index 0000000..3a3993f --- /dev/null +++ b/iyp/crawlers/openintel/dns_dependency_jp.py @@ -0,0 +1,45 @@ +import argparse +import logging +import os +import sys + +from iyp.crawlers.openintel import DnsDependencyCrawler + +URL = 'https://storage.dacs.utwente.nl/sommeser-dnsdep/JP' +ORG = 'OpenINTEL' +NAME = 'openintel.dns_dependency_jp' + + +class Crawler(DnsDependencyCrawler): + def __init__(self, organization, url, name): + super().__init__(organization, url, name) + + +def main() -> None: + parser = argparse.ArgumentParser() + parser.add_argument('--unit-test', action='store_true') + args = parser.parse_args() + + scriptname = os.path.basename(sys.argv[0]).replace('/', '_')[0:-3] + FORMAT = '%(asctime)s %(levelname)s %(message)s' + logging.basicConfig( + format=FORMAT, + filename='log/' + scriptname + '.log', + level=logging.INFO, + datefmt='%Y-%m-%d %H:%M:%S' + ) + + logging.info(f'Started: {sys.argv}') + + crawler = Crawler(ORG, URL, NAME) + if args.unit_test: + crawler.unit_test(logging) + else: + crawler.run() + crawler.close() + logging.info(f'Finished: {sys.argv}') + + +if __name__ == '__main__': + main() + sys.exit(0) diff --git a/iyp/crawlers/openintel/dns_dependency_nl.py b/iyp/crawlers/openintel/dns_dependency_nl.py new file mode 100644 index 0000000..04c6729 --- /dev/null +++ b/iyp/crawlers/openintel/dns_dependency_nl.py @@ -0,0 +1,45 @@ +import argparse +import logging +import os +import sys + +from 
iyp.crawlers.openintel import DnsDependencyCrawler + +URL = 'https://storage.dacs.utwente.nl/sommeser-dnsdep/NL' +ORG = 'OpenINTEL' +NAME = 'openintel.dns_dependency_nl' + + +class Crawler(DnsDependencyCrawler): + def __init__(self, organization, url, name): + super().__init__(organization, url, name) + + +def main() -> None: + parser = argparse.ArgumentParser() + parser.add_argument('--unit-test', action='store_true') + args = parser.parse_args() + + scriptname = os.path.basename(sys.argv[0]).replace('/', '_')[0:-3] + FORMAT = '%(asctime)s %(levelname)s %(message)s' + logging.basicConfig( + format=FORMAT, + filename='log/' + scriptname + '.log', + level=logging.INFO, + datefmt='%Y-%m-%d %H:%M:%S' + ) + + logging.info(f'Started: {sys.argv}') + + crawler = Crawler(ORG, URL, NAME) + if args.unit_test: + crawler.unit_test(logging) + else: + crawler.run() + crawler.close() + logging.info(f'Finished: {sys.argv}') + + +if __name__ == '__main__': + main() + sys.exit(0)
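
For reference, a minimal sketch of the input layout the new DNS dependency crawler
expects (file and field names taken from `DnsDependencyCrawler.run()` above; the
concrete values are illustrative):

```python
import pandas as pd

# Files fetched per weekly snapshot, e.g. <URL>/year=2024/week=05/:
#   domain_nodes.json.gz  -> {"name": "example.nl"}
#   host_nodes.json.gz    -> {"name": "ns1.example.nl"}
#   ip_nodes.json.gz      -> {"address": "192.0.2.1"}
#   connections.json.gz   -> one relationship per line, as below
example_connection = {
    'relation_name': 'MANAGED_BY',  # PARENT | MANAGED_BY | PART_OF | ALIAS_OF | RESOLVES_TO
    'from_nodeKey': 'example.nl',   # key into the node files above
    'to_nodeKey': 'ns1.example.nl',
    'properties': {},               # merged into the relationship properties
}

# The crawler reads each file as JSON Lines; a stand-in for pd.read_json(..., lines=True):
connections = pd.DataFrame([example_connection])
```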