From 838c8bed6936fa473d6510bb18c9f89f48ab177b Mon Sep 17 00:00:00 2001 From: Romain Date: Wed, 7 Feb 2024 16:55:53 +0900 Subject: [PATCH 1/2] DNS remodeling (#119) * update url2domain to url2hostname * remove iana root zone file and dns hierarchy from config file * Atlas measurement targets are now hostnames * update openintel crawlers to the new DNS model * umbrella now ranks a mix of DomainName and HostName nodes and should be run after openintel.umbrella1m * Add explanation for cloudflare DNS modeling * lower umbrella crawler in config file * update READMEs with the new DNS modeling * add (:Service {name:'DNS'}) node and link it to authoritative name servers * Nodes do not have reference properties * Normalize IPv6 addresses * Fix wrong crawler name * Typos and formatting * Remove infra_mx crawler since it does not do anything at the moment * Update Cisco Umbrella crawler - Batch create new nodes (happens more often than expected) - Add logging output - Do not use builtins as variable names * Remove redundant set and parameters * Remove Service node for now We could not decide on a name, so we will deal with this later. --------- Co-authored-by: Malte Tashiro --- config.json.example | 7 +- iyp/crawlers/cisco/README.md | 8 +- iyp/crawlers/cisco/umbrella_top1M.py | 61 +++++++++++-- iyp/crawlers/cloudflare/README.md | 8 +- iyp/crawlers/cloudflare/dns_top_ases.py | 5 ++ iyp/crawlers/cloudflare/dns_top_locations.py | 5 ++ iyp/crawlers/cloudflare/ranking_bucket.py | 3 + iyp/crawlers/cloudflare/top100.py | 3 + iyp/crawlers/openintel/README.md | 20 +++-- iyp/crawlers/openintel/__init__.py | 95 ++++++++++++-------- iyp/crawlers/openintel/infra_mx.py | 6 ++ iyp/crawlers/openintel/infra_ns.py | 3 +- iyp/crawlers/ripe/README.md | 4 +- iyp/crawlers/ripe/atlas_measurements.py | 26 +++--- iyp/post/{url2domain.py => url2hostname.py} | 18 ++-- 15 files changed, 183 insertions(+), 89 deletions(-) rename iyp/post/{url2domain.py => url2hostname.py} (68%) diff --git a/config.json.example b/config.json.example index abace94..81134ca 100644 --- a/config.json.example +++ b/config.json.example @@ -63,11 +63,10 @@ "iyp.crawlers.peeringdb.ix", "iyp.crawlers.cloudflare.top100", "iyp.crawlers.tranco.top1M", - "iyp.crawlers.cisco.umbrella_top1M", "iyp.crawlers.openintel.tranco1m", "iyp.crawlers.openintel.umbrella1m", "iyp.crawlers.openintel.infra_ns", - "iyp.crawlers.openintel.infra_mx", + "iyp.crawlers.cisco.umbrella_top1M", "iyp.crawlers.citizenlab.urldb", "iyp.crawlers.inetintel.as_org", "iyp.crawlers.pch.daily_routing_snapshots_v4", @@ -75,7 +74,6 @@ "iyp.crawlers.emileaben.as_names", "iyp.crawlers.ripe.atlas_probes", "iyp.crawlers.ripe.atlas_measurements", - "iyp.crawlers.iana.root_zone", "iyp.crawlers.alice_lg.amsix", "iyp.crawlers.alice_lg.bcix", "iyp.crawlers.alice_lg.decix", @@ -91,8 +89,7 @@ "iyp.post.ip2prefix", "iyp.post.address_family", "iyp.post.country_information", - "iyp.post.dns_hierarchy", - "iyp.post.url2domain" + "iyp.post.url2hostname" ] } } diff --git a/iyp/crawlers/cisco/README.md b/iyp/crawlers/cisco/README.md index 24d1811..839402e 100644 --- a/iyp/crawlers/cisco/README.md +++ b/iyp/crawlers/cisco/README.md @@ -1,8 +1,9 @@ # Cisco Umbrella -- https://umbrella-static.s3-us-west-1.amazonaws.com/index.html -The popularity list contains most queried domains based on passive DNS usage across the Umbrella global network. +The popularity list contains most queried domains (ranging from TLDs to FQDNs) +based on passive DNS usage across the Umbrella global network. 
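+
+For names that are not already present in IYP, the updated crawler falls back to
+`tldextract` to decide which node type to create; a minimal, illustrative sketch of
+that check:
+
+```python
+import tldextract
+
+def classify(name: str) -> str:
+    """Return the node label a ranked entry would get (sketch only)."""
+    if name == tldextract.extract(name).registered_domain:
+        return 'DomainName'
+    return 'HostName'
+
+assert classify('google.com') == 'DomainName'    # registered domain
+assert classify('www.google.com') == 'HostName'  # FQDN below a registered domain
+```
+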
-IYP uses this data to create and annotate DomainName nodes. +IYP uses this data to create and annotate DomainName and HostName nodes. ## Graph representation @@ -10,8 +11,9 @@ The rank of the domain is indicated by the `rank` property of the relationship. ```Cypher (:DomainName {name: 'google.com'})-[:RANK {rank: 1}]->(:Ranking {name: 'Cisco Umbrella Top 1 million'}) +(:HostName {name: 'www.google.com'})-[:RANK {rank: 8}]->(:Ranking {name: 'Cisco Umbrella Top 1 million'}) ``` ## Dependence -This crawler is not depending on other crawlers. +This crawler depends on `openintel.umbrella1m`. diff --git a/iyp/crawlers/cisco/umbrella_top1M.py b/iyp/crawlers/cisco/umbrella_top1M.py index 629f15b..714681b 100644 --- a/iyp/crawlers/cisco/umbrella_top1M.py +++ b/iyp/crawlers/cisco/umbrella_top1M.py @@ -6,6 +6,7 @@ from zipfile import ZipFile import requests +import tldextract from iyp import BaseCrawler, RequestStatusError @@ -22,31 +23,75 @@ def run(self): self.cisco_qid = self.iyp.get_node('Ranking', {'name': 'Cisco Umbrella Top 1 million'}) - sys.stderr.write('Downloading latest list...\n') + logging.info('Downloading latest list...') req = requests.get(URL) if req.status_code != 200: raise RequestStatusError('Error while fetching Cisco Umbrella Top 1M csv file') links = [] - domains = set() # open zip file and read top list with ZipFile(io.BytesIO(req.content)) as z: - with z.open('top-1m.csv') as list: - for i, row in enumerate(io.TextIOWrapper(list)): + with z.open('top-1m.csv') as top_list: + for i, row in enumerate(io.TextIOWrapper(top_list)): row = row.rstrip() rank, domain = row.split(',') - domains.add(domain) links.append({'src_name': domain, 'dst_id': self.cisco_qid, 'props': [self.reference, {'rank': int(rank)}]}) - name_id = self.iyp.batch_get_nodes_by_single_prop('DomainName', 'name', domains) + logging.info('Fetching DomainName/HostName nodes...') + domain_id = self.iyp.batch_get_nodes_by_single_prop('DomainName', 'name') + host_id = self.iyp.batch_get_nodes_by_single_prop('HostName', 'name') + # Umbrella mixes up domain and host names. 
+ # By order of preferences we rank: + # 1) existing domain name + # 2) existing host name + # 3) do our best to figure out if it is a domain or host and create the + # corresponding node + + new_domain_names = set() + new_host_names = set() + unprocessed_links = list() + processed_links = list() + + logging.info('Building relationships...') for link in links: - link['src_id'] = name_id[link['src_name']] + if link['src_name'] in domain_id: + link['src_id'] = domain_id[link['src_name']] + processed_links.append(link) + elif link['src_name'] in host_id: + link['src_id'] = host_id[link['src_name']] + processed_links.append(link) + else: + unprocessed_links.append(link) + ranked_thing = tldextract.extract(link['src_name']) + name = link['src_name'] + if name == ranked_thing.registered_domain: + new_domain_names.add(name) + else: + new_host_names.add(name) + + if new_domain_names: + logging.info(f'Pushing {len(new_domain_names)} additional DomainName nodes...') + domain_id.update(self.iyp.batch_get_nodes_by_single_prop('DomainName', 'name', new_domain_names, all=False)) + if new_host_names: + logging.info(f'Pushing {len(new_host_names)} additional HostName nodes...') + host_id.update(self.iyp.batch_get_nodes_by_single_prop('HostName', 'name', new_host_names, all=False)) + + for link in unprocessed_links: + if link['src_name'] in domain_id: + link['src_id'] = domain_id[link['src_name']] + elif link['src_name'] in host_id: + link['src_id'] = host_id[link['src_name']] + else: + logging.error(f'Missing DomainName/HostName node for name "{link["src_name"]}". Should not happen.') + continue + processed_links.append(link) # Push all links to IYP - self.iyp.batch_add_links('RANK', links) + logging.info(f'Pushing {len(processed_links)} RANK relationships...') + self.iyp.batch_add_links('RANK', processed_links) def main() -> None: diff --git a/iyp/crawlers/cloudflare/README.md b/iyp/crawlers/cloudflare/README.md index 4ba886f..7ce4aee 100644 --- a/iyp/crawlers/cloudflare/README.md +++ b/iyp/crawlers/cloudflare/README.md @@ -1,4 +1,4 @@ -# Cloudflare Radar -- https://radar.cloudflare.com/ +# Cloudflare Radar -- https://radar.cloudflare.com/ Cloudflare uses aggregated and anonymized DNS queries to their `1.1.1.1` public resolver service to provide various datasets, including: @@ -17,8 +17,12 @@ provide various datasets, including: - [Top 100 ASes querying each of the 10,000 highest ranked domain names](https://developers.cloudflare.com/api/operations/radar_get__top_ases): Same as above, but fetch AS numbers instead. - + All rankings are based on one week of data. +Cloudflare radar's top location and ASes is available for both domain names +and host names. Results are likely accounting for all NS, A, AAAA queries made to +Cloudflare's resolver. Since NS queries for host names make no sense IYP links these +results to `DomainName` nodes. ## Graph representation diff --git a/iyp/crawlers/cloudflare/dns_top_ases.py b/iyp/crawlers/cloudflare/dns_top_ases.py index f24c952..8c15ac0 100644 --- a/iyp/crawlers/cloudflare/dns_top_ases.py +++ b/iyp/crawlers/cloudflare/dns_top_ases.py @@ -1,3 +1,8 @@ +# Cloudflare radar's top location and ASes is available for both domain names +# and host names. Results are likely accounting for all NS, A, AAAA queries made to +# Cloudflare's resolver. Since NS queries for host names make no sense it seems +# more intuitive to link these results to DomainName nodes. 
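+#
+# For example, A/AAAA queries for 'www.google.com' and NS queries for 'google.com'
+# presumably both count towards the figures reported for 'google.com', so the
+# per-AS results are attached to the (:DomainName {name: 'google.com'}) node
+# rather than to a HostName node.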
+ import argparse import logging import os diff --git a/iyp/crawlers/cloudflare/dns_top_locations.py b/iyp/crawlers/cloudflare/dns_top_locations.py index 4f4118f..46e46a0 100644 --- a/iyp/crawlers/cloudflare/dns_top_locations.py +++ b/iyp/crawlers/cloudflare/dns_top_locations.py @@ -1,3 +1,8 @@ +# Cloudflare radar's top location and ASes is available for both domain names +# and host names. Results are likely accounting for all NS, A, AAAA queries made to +# Cloudflare's resolver. Since NS queries for host names make no sense it seems +# more intuitive to link these results to DomainName nodes. + import argparse import glob import json diff --git a/iyp/crawlers/cloudflare/ranking_bucket.py b/iyp/crawlers/cloudflare/ranking_bucket.py index 7193ad6..fbea3f0 100644 --- a/iyp/crawlers/cloudflare/ranking_bucket.py +++ b/iyp/crawlers/cloudflare/ranking_bucket.py @@ -24,6 +24,9 @@ class Crawler(BaseCrawler): # Base Crawler provides access to IYP via self.iyp and setup a dictionary with the # org/url/today's date in self.reference + # + # Cloudflare ranks second and third level domain names (not host names). + # See https://blog.cloudflare.com/radar-domain-rankings/ def run(self): """Fetch data and push to IYP.""" diff --git a/iyp/crawlers/cloudflare/top100.py b/iyp/crawlers/cloudflare/top100.py index 5e2f5fb..d189da8 100644 --- a/iyp/crawlers/cloudflare/top100.py +++ b/iyp/crawlers/cloudflare/top100.py @@ -21,6 +21,9 @@ class Crawler(BaseCrawler): # Base Crawler provides access to IYP via self.iyp # and setup a dictionary with the org/url/today's date in self.reference + # + # Cloudflare ranks second and third level domain names (not host names). + # See https://blog.cloudflare.com/radar-domain-rankings/ def run(self): """Fetch data and push to IYP.""" diff --git a/iyp/crawlers/openintel/README.md b/iyp/crawlers/openintel/README.md index 70cf9c5..0f3ec5f 100644 --- a/iyp/crawlers/openintel/README.md +++ b/iyp/crawlers/openintel/README.md @@ -4,26 +4,34 @@ The OpenINTEL measurement platform captures daily snapshots of the state of larg global Domain Name System (DNS) by running a number of forward and reverse DNS measurements. While OpenINTEL runs measurements to a variety of domain names, IYP currently only fetches data for -the [Tranco top 1 million list](https://data.openintel.nl/data/tranco1m/) and the CISCO Umbrella +the [Tranco top 1 million list](https://data.openintel.nl/data/tranco1m/) and the CISCO Umbrella top 1 million list since it combines rankings. IYP also get the list of authoritative names servers seen by OpenINTEL. -IYP uses only `A` queries to add IP resolution for DomainName and AuthoritativeNameServer nodes. - A crawler of mail servers is also implemented but not used as it creates a very large number of links and this dataset is currently not requested/needed by anyone. 
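+
+Node types are derived from the DNS record type of each answer: `query_name`s of NS
+records become DomainName nodes, `query_name`s of A/AAAA records become HostName
+nodes, and NS record values (`ns_address`) become HostName nodes labelled
+AuthoritativeNameServer. A condensed, illustrative sketch (column names as in the
+OpenINTEL Parquet data read by the crawler):
+
+```python
+import pandas as pd
+
+# Toy stand-in for the Parquet data loaded by the crawler.
+df = pd.DataFrame([
+    {'query_name': 'google.com', 'response_type': 'NS', 'ns_address': 'ns1.google.com'},
+    {'query_name': 'www.google.com', 'response_type': 'A', 'ns_address': None},
+])
+
+domain_names = set(df[df.response_type == 'NS']['query_name'])            # DomainName
+host_names = set(df[df.response_type.isin(['A', 'AAAA'])]['query_name'])  # HostName
+name_servers = set(df[df.ns_address.notnull()]['ns_address'])             # HostName + AuthoritativeNameServer
+```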
## Graph representation -IP resolution for popular domain names: +IP resolution for popular host names: + ```Cypher -(:DomainName {name: 'google.com'})-[:RESOLVES_TO]->(:IP {ip: '142.250.179.142'}) +(:HostName {name: 'google.com'})-[:RESOLVES_TO]->(:IP {ip: '142.250.179.142'}) ``` IP resolution of authoritative name servers: + +```Cypher +(:HostName:AuthoritativeNameServer {name: 'ns1.google.com'})-[:RESOLVES_TO]->(:IP {ip: '216.239.32.10'}) +(:IP {ip: '216.239.32.10'})-[:SERVE]->(:Service {name: 'DNS'}) +``` + +Domain names managed by name servers: + ```Cypher -(:AuthoritativeNameServer {name: 'ns1.google.com'})-[:RESOLVES_TO]->(:IP {ip: '216.239.32.10'}) +(:DomainName {name: 'google.com'})-[:MANAGED_BY]->(:HostName:AuthoritativeNameServer {name: 'ns1.google.com'}) ``` + ## Dependence This crawler is not depending on other crawlers. diff --git a/iyp/crawlers/openintel/__init__.py b/iyp/crawlers/openintel/__init__.py index 75b3c05..d255205 100644 --- a/iyp/crawlers/openintel/__init__.py +++ b/iyp/crawlers/openintel/__init__.py @@ -7,6 +7,7 @@ import logging import os import tempfile +from ipaddress import IPv6Address import arrow import boto3 @@ -42,17 +43,11 @@ def valid_date(s): class OpenIntelCrawler(BaseCrawler): - def __init__(self, organization, url, name, dataset, additional_domain_type=str()): + def __init__(self, organization, url, name, dataset): """Initialization of the OpenIntel crawler requires the name of the dataset - (e.g. tranco or infra:ns). - - If the dataset contains special types of domain - names, an additional label can be specified (e.g., `AuthoritativeNameServer`) - that will be attached to the `DomainName` nodes. - """ + (e.g. tranco or infra:ns).""" self.dataset = dataset - self.additional_domain_type = additional_domain_type super().__init__(organization, url, name) def get_parquet(self): @@ -179,52 +174,74 @@ def run(self): print(f'Read {len(df)} unique records from {len(self.pandas_df_list)} Parquet file(s).') - # Only domain names from the `query_name` column that will receive the - # additional_domain_type label (if present). - query_domain_names = set(df['query_name']) - # Name server domain names. - ns_domain_names = set(df[df.ns_address.notnull()]['ns_address']) - # All domain names, including the ones from the name server column. - all_domain_names = query_domain_names.union(ns_domain_names) - # Create all DomainName nodes. - domain_id = self.iyp.batch_get_nodes_by_single_prop('DomainName', 'name', all_domain_names) - # Get node IDs for NS nodes and add NS label. - ns_id = {name: domain_id[name] for name in ns_domain_names} + # query_names for NS records are domain names + domain_names = set(df[df.response_type == 'NS']['query_name']) + + # response values of NS records are name servers + name_servers = set(df[df.ns_address.notnull()]['ns_address']) + + # query_names for A and AAAA records are host names + host_names = set(df[(df.response_type == 'A') | (df.response_type == 'AAAA')]['query_name']) + + ipv6_addresses = set() + # Normalize IPv6 addresses. 
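+        # e.g. '2001:0db8:0000:0000:0000:0000:0000:0001' and '2001:db8::1' denote
+        # the same address; storing the compressed form avoids duplicate IP nodes.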
+ for ip in df[df.ip6_address.notnull()]['ip6_address']: + try: + ip_normalized = IPv6Address(ip).compressed + except ValueError as e: + logging.error(f'Ignoring invalid IPv6 address "{ip}": {e}') + continue + ipv6_addresses.add(ip_normalized) + + # Get/create all nodes: + domain_id = self.iyp.batch_get_nodes_by_single_prop('DomainName', 'name', domain_names) + host_id = self.iyp.batch_get_nodes_by_single_prop('HostName', 'name', host_names) + ns_id = self.iyp.batch_get_nodes_by_single_prop('HostName', 'name', name_servers) self.iyp.batch_add_node_label(list(ns_id.values()), 'AuthoritativeNameServer') - # Add additional node label if present. - additional_id = set() - if self.additional_domain_type and self.additional_domain_type != 'DomainName': - additional_id = {domain_id[name] for name in query_domain_names} - self.iyp.batch_add_node_label(list(additional_id), self.additional_domain_type) ip4_id = self.iyp.batch_get_nodes_by_single_prop('IP', 'ip', set(df[df.ip4_address.notnull()]['ip4_address'])) - ip6_id = self.iyp.batch_get_nodes_by_single_prop('IP', 'ip', set(df[df.ip6_address.notnull()]['ip6_address'])) + ip6_id = self.iyp.batch_get_nodes_by_single_prop('IP', 'ip', ipv6_addresses) + + print(f'Got {len(domain_id)} domains, {len(ns_id)} nameservers, {len(host_id)} hosts, {len(ip4_id)} IPv4, ' + f'{len(ip6_id)} IPv6') + + # Compute links res_links = [] mng_links = [] + partof_links = [] - print(f'Got {len(domain_id)} domains, {len(ns_id)} nameservers, {len(ip4_id)} IPv4, {len(ip6_id)} IPv6') - if self.additional_domain_type: - print(f'Added "{self.additional_domain_type}" label to {len(additional_id)} nodes.') - + # RESOLVES_TO and MANAGED_BY links for row in df.itertuples(): - domain_qid = domain_id[row.query_name] + + # NS Record + if row.response_type == 'NS' and row.ns_address: + domain_qid = domain_id[row.query_name] + ns_qid = ns_id[row.ns_address] + mng_links.append({'src_id': domain_qid, 'dst_id': ns_qid, 'props': [self.reference]}) # A Record - if row.response_type == 'A' and row.ip4_address: + elif row.response_type == 'A' and row.ip4_address: + host_qid = host_id[row.query_name] ip_qid = ip4_id[row.ip4_address] - res_links.append({'src_id': domain_qid, 'dst_id': ip_qid, 'props': [self.reference]}) + res_links.append({'src_id': host_qid, 'dst_id': ip_qid, 'props': [self.reference]}) # AAAA Record elif row.response_type == 'AAAA' and row.ip6_address: - ip_qid = ip6_id[row.ip6_address] - res_links.append({'src_id': domain_qid, 'dst_id': ip_qid, 'props': [self.reference]}) - - # NS Record - elif row.response_type == 'NS' and row.ns_address: - ns_qid = ns_id[row.ns_address] - mng_links.append({'src_id': domain_qid, 'dst_id': ns_qid, 'props': [self.reference]}) + try: + ip_normalized = IPv6Address(row.ip6_address).compressed + except ValueError: + # Error message was already logged above. 
+ continue + host_qid = host_id[row.query_name] + ip_qid = ip6_id[ip_normalized] + res_links.append({'src_id': host_qid, 'dst_id': ip_qid, 'props': [self.reference]}) + + # PART_OF links between HostNames and DomainNames + for hd in host_names.intersection(domain_names): + partof_links.append({'src_id': host_id[hd], 'dst_id': domain_id[hd], 'props': [self.reference]}) print(f'Computed {len(res_links)} RESOLVES_TO links and {len(mng_links)} MANAGED_BY links') # Push all links to IYP self.iyp.batch_add_links('RESOLVES_TO', res_links) self.iyp.batch_add_links('MANAGED_BY', mng_links) + self.iyp.batch_add_links('PART_OF', partof_links) diff --git a/iyp/crawlers/openintel/infra_mx.py b/iyp/crawlers/openintel/infra_mx.py index 24266d0..3841c77 100644 --- a/iyp/crawlers/openintel/infra_mx.py +++ b/iyp/crawlers/openintel/infra_mx.py @@ -19,6 +19,12 @@ def __init__(self, organization, url, name): def main() -> None: + + ############################################ + # This crawler is not working the NODE_TYPE argument has been deprecated + ############################################ + return + parser = argparse.ArgumentParser() parser.add_argument('--unit-test', action='store_true') args = parser.parse_args() diff --git a/iyp/crawlers/openintel/infra_ns.py b/iyp/crawlers/openintel/infra_ns.py index 776f1aa..a4395c6 100644 --- a/iyp/crawlers/openintel/infra_ns.py +++ b/iyp/crawlers/openintel/infra_ns.py @@ -10,12 +10,11 @@ NAME = 'openintel.infra_ns' DATASET = 'infra:ns' -NODE_TYPE = 'AuthoritativeNameServer' class Crawler(OpenIntelCrawler): def __init__(self, organization, url, name): - super().__init__(organization, url, name, DATASET, NODE_TYPE) + super().__init__(organization, url, name, DATASET) def main() -> None: diff --git a/iyp/crawlers/ripe/README.md b/iyp/crawlers/ripe/README.md index f56196b..ce54b07 100644 --- a/iyp/crawlers/ripe/README.md +++ b/iyp/crawlers/ripe/README.md @@ -52,7 +52,7 @@ ASN(s), and country. We fetch the [list of measurements](https://atlas.ripe.net/docs/apis/rest-api-manual/measurements/) to obtain metadata of *ongoing* Atlas measurements. `AtlasProbe`s are `PART_OF` -`AtlasMeasurement`s and measurements `TARGET` one or more `IP`s, a `DomainName`, or +`AtlasMeasurement`s and measurements `TARGET` one or more `IP`s, a `HostName`, or both. The Atlas platform also maps the measurement target to an `AS` number if possible. The crawler includes this relationship as well. @@ -61,7 +61,7 @@ never connected or are abandoned. ```Cypher (:AtlasProbe {id: 6425})-[:PART_OF]->(:AtlasMeasurement {id: 17635549})-[:TARGET]->(:AS {asn: 2497}) -(:AtlasProbe {id: 6425})-[:PART_OF]->(:AtlasMeasurement {id: 17635549})-[:TARGET]->(:DomainName {name: 'jp-tyo-as2497.anchors.atlas.ripe.net'}) +(:AtlasProbe {id: 6425})-[:PART_OF]->(:AtlasMeasurement {id: 17635549})-[:TARGET]->(:HostName {name: 'jp-tyo-as2497.anchors.atlas.ripe.net'}) (:AtlasProbe {id: 6425})-[:PART_OF]->(:AtlasMeasurement {id: 17635549})-[:TARGET]->(:IP {ip: '202.214.87.158'}) ``` diff --git a/iyp/crawlers/ripe/atlas_measurements.py b/iyp/crawlers/ripe/atlas_measurements.py index d0f9aac..79770f0 100644 --- a/iyp/crawlers/ripe/atlas_measurements.py +++ b/iyp/crawlers/ripe/atlas_measurements.py @@ -73,7 +73,7 @@ def __transform_data(data): # on the '_' delimiter, this action would potentially # cause a TypeError from flatdict if it isn't handled properly. 
target_info = { - 'domain': item.pop('target', None), + 'hostname': item.pop('target', None), 'asn': item.pop('target_asn', None), 'ip': item.pop('target_ip', None), 'prefix': item.pop('target_prefix', None), @@ -148,7 +148,7 @@ def run(self): probe_ids = set() ips = set() ases = set() - domains = set() + hostnames = set() valid_probe_measurements = list() @@ -171,16 +171,16 @@ def run(self): probe_af = int(probe_measurement['af']) resolved_ips[i] = ipaddress.ip_address(resolved_ips[i]).compressed if probe_af == 6 else resolved_ips[i] - domain = probe_measurement['target']['domain'] - if domain == '' or self.__is_valid_ip(domain): - domain = None - probe_measurement['target']['domain'] = None + hostname = probe_measurement['target']['hostname'] + if hostname == '' or self.__is_valid_ip(hostname): + hostname = None + probe_measurement['target']['hostname'] = None asn = probe_measurement['target']['asn'] probe_ids_participated = probe_measurement['current_probes'] self.__add_if_not_none(probe_measurement_id, probe_measurement_ids) - self.__add_if_not_none(domain, domains) + self.__add_if_not_none(hostname, hostnames) self.__add_if_not_none(asn, ases) self.__add_if_not_none_values(resolved_ips, ips) self.__add_if_not_none_values(probe_ids_participated, probe_ids) @@ -204,8 +204,8 @@ def run(self): probe_ids = self.iyp.batch_get_nodes_by_single_prop('AtlasProbe', 'id', probe_ids, all=False, create=True) logging.info(f'{len(ips)} IPs') ip_ids = self.iyp.batch_get_nodes_by_single_prop('IP', 'ip', ips, all=False, create=True) - logging.info(f'{len(domains)} domains') - domain_ids = self.iyp.batch_get_nodes_by_single_prop('DomainName', 'name', domains, all=False, create=True) + logging.info(f'{len(hostnames)} hostnames') + hostname_ids = self.iyp.batch_get_nodes_by_single_prop('HostName', 'name', hostnames, all=False, create=True) logging.info(f'{len(ases)} ASNs') asn_ids = self.iyp.batch_get_nodes_by_single_prop('AS', 'asn', ases, all=False, create=True) @@ -226,10 +226,10 @@ def run(self): target_links.append({'src_id': probe_measurement_qid, 'dst_id': asn_qid, 'props': [probe_measurement_reference]}) - probe_measurement_domain = probe_measurement['target']['domain'] - if probe_measurement_domain: - domain_qid = domain_ids[probe_measurement_domain] - target_links.append({'src_id': probe_measurement_qid, 'dst_id': domain_qid, + probe_measurement_hostname = probe_measurement['target']['hostname'] + if probe_measurement_hostname: + hostname_qid = hostname_ids[probe_measurement_hostname] + target_links.append({'src_id': probe_measurement_qid, 'dst_id': hostname_qid, 'props': [probe_measurement_reference]}) probe_measurement_ips = self.__get_all_resolved_ips(probe_measurement) diff --git a/iyp/post/url2domain.py b/iyp/post/url2hostname.py similarity index 68% rename from iyp/post/url2domain.py rename to iyp/post/url2hostname.py index 6a89f94..6429a3f 100644 --- a/iyp/post/url2domain.py +++ b/iyp/post/url2hostname.py @@ -8,27 +8,27 @@ class PostProcess(BasePostProcess): def run(self): - """Link URLs and their corresponding DomainNames.""" + """Link URLs and their corresponding HostNames.""" # Get all URL nodes. 
url_id = self.iyp.batch_get_nodes_by_single_prop('URL', 'url') - # Get all DomainName Nodes - domain_id = self.iyp.batch_get_nodes_by_single_prop('DomainName', 'name') + # Get all HostName Nodes + hostname_id = self.iyp.batch_get_nodes_by_single_prop('HostName', 'name') # Compute links links = [] for url, url_qid in url_id.items(): - # Extract domain name from URL - domain = tldextract.extract(url).registered_domain + # Extract host name from URL + hostname = tldextract.extract(url).fqdn - # Get DomainName node for the domain - domain_qid = domain_id.get(domain) + # Get HostName node for the fqdn of the URL + hostname_qid = hostname_id.get(hostname) - if domain_qid is not None: + if hostname_qid is not None: links.append({ 'src_id': url_qid, - 'dst_id': domain_qid, + 'dst_id': hostname_qid, 'props': [self.reference] }) From c67213184f42e21d6a7d62d59a593a2a24c43c9e Mon Sep 17 00:00:00 2001 From: Malte Tashiro Date: Wed, 7 Feb 2024 09:06:10 +0000 Subject: [PATCH 2/2] Add OpenINTEL DNS dependency crawler Integrate with existing files and remove some unnecessary stuff. Co-authored-by: Raffaele Sommese --- config.json.example | 2 + iyp/crawlers/openintel/__init__.py | 145 ++++++++++++++++---- iyp/crawlers/openintel/dns_dependency_jp.py | 45 ++++++ iyp/crawlers/openintel/dns_dependency_nl.py | 45 ++++++ 4 files changed, 213 insertions(+), 24 deletions(-) create mode 100644 iyp/crawlers/openintel/dns_dependency_jp.py create mode 100644 iyp/crawlers/openintel/dns_dependency_nl.py diff --git a/config.json.example b/config.json.example index 81134ca..a281435 100644 --- a/config.json.example +++ b/config.json.example @@ -81,6 +81,8 @@ "iyp.crawlers.alice_lg.linx", "iyp.crawlers.alice_lg.megaport", "iyp.crawlers.alice_lg.netnod", + "iyp.crawlers.openintel.dns_dependency_nl", + "iyp.crawlers.openintel.dns_dependency_jp", "iyp.crawlers.cloudflare.dns_top_locations", "iyp.crawlers.cloudflare.dns_top_ases" ], diff --git a/iyp/crawlers/openintel/__init__.py b/iyp/crawlers/openintel/__init__.py index d255205..a8a6dbc 100644 --- a/iyp/crawlers/openintel/__init__.py +++ b/iyp/crawlers/openintel/__init__.py @@ -1,12 +1,12 @@ # Simple Python script to fetch domain name to IP address mappings from OpenINTEL data -# Based on code from Mattijs Jonker +# OpenIntelCrawler is based on code from Mattijs Jonker import argparse -import datetime import json import logging import os import tempfile +from datetime import datetime, timedelta, timezone from ipaddress import IPv6Address import arrow @@ -15,15 +15,11 @@ import pandas as pd import requests -from iyp import BaseCrawler +from iyp import BaseCrawler, RequestStatusError TMP_DIR = './tmp' os.makedirs(TMP_DIR, exist_ok=True) -URL = 'https://data.openintel.nl/data/' -ORG = 'OpenINTEL' -NAME = 'openintel.*' - # credentials OPENINTEL_ACCESS_KEY = '' OPENINTEL_SECRET_KEY = '' @@ -36,7 +32,7 @@ def valid_date(s): try: - return datetime.datetime.strptime(s, '%Y-%m-%d') + return datetime.strptime(s, '%Y-%m-%d') except ValueError: msg = 'not a valid ISO 8601 date: {0!r}'.format(s) raise argparse.ArgumentTypeError(msg) @@ -78,22 +74,24 @@ def get_parquet(self): # check on the website if yesterday's data is available yesterday = arrow.utcnow().shift(days=-1) - url = URL.format(year=yesterday.year, month=yesterday.month, day=yesterday.day) - try: - req = requests.head(url) - - attempt = 3 - while req.status_code != 200 and attempt > 0: - print(req.status_code) - attempt -= 1 - yesterday = yesterday.shift(days=-1) - url = URL.format(year=yesterday.year, month=yesterday.month, 
day=yesterday.day) - req = requests.head(url) - - except requests.exceptions.ConnectionError: - logging.warning("Cannot reach OpenINTEL website, try yesterday's data") - yesterday = arrow.utcnow().shift(days=-1) - url = URL.format(year=yesterday.year, month=yesterday.month, day=yesterday.day) + # FIXME Check at the proper place. Remove flake8 exception afterwards. + # flake8: noqa + # url = URL.format(year=yesterday.year, month=yesterday.month, day=yesterday.day) + # try: + # req = requests.head(url) + + # attempt = 3 + # while req.status_code != 200 and attempt > 0: + # print(req.status_code) + # attempt -= 1 + # yesterday = yesterday.shift(days=-1) + # url = URL.format(year=yesterday.year, month=yesterday.month, day=yesterday.day) + # req = requests.head(url) + + # except requests.exceptions.ConnectionError: + # logging.warning("Cannot reach OpenINTEL website, try yesterday's data") + # yesterday = arrow.utcnow().shift(days=-1) + # url = URL.format(year=yesterday.year, month=yesterday.month, day=yesterday.day) logging.warning(f'Fetching data for {yesterday}') @@ -245,3 +243,102 @@ def run(self): self.iyp.batch_add_links('RESOLVES_TO', res_links) self.iyp.batch_add_links('MANAGED_BY', mng_links) self.iyp.batch_add_links('PART_OF', partof_links) + + +class DnsDependencyCrawler(BaseCrawler): + + def __init__(self, organization, url, name): + super().__init__(organization, url, name) + + def run(self): + # Extract current date for partitioning + logging.info('Probing available data') + max_lookback_in_weeks = 1 + for lookback in range(0, max_lookback_in_weeks + 1): + current_date = datetime.now(tz=timezone.utc) - timedelta(weeks=lookback) + year = current_date.strftime('%Y') + week = current_date.strftime('%U') + base_url = f'{self.reference["reference_url"]}/year={year}/week={week}' + probe_url = f'{base_url}/domain_nodes.json.gz' + if requests.head(probe_url).ok: + logging.info(f'Using year={year}/week={week} ({current_date.strftime("%Y-%m-%d")})') + break + else: + logging.error('Failed to find data within the specified lookback interval.') + raise RequestStatusError('Failed to find data within the specified lookback interval.') + + logging.info('Reading domain names') + domains = pd.read_json(f'{base_url}/domain_nodes.json.gz', lines=True) + logging.info('Reading host names') + hosts = pd.read_json(f'{base_url}/host_nodes.json.gz', lines=True) + logging.info('Reading IPs') + ips = pd.read_json(f'{base_url}/ip_nodes.json.gz', lines=True) + logging.info('Reading connections') + connections = pd.read_json(f'{base_url}/connections.json.gz', lines=True) + + unique_domain_names = set(domains['name']) + unique_host_names = set(hosts['name']) + unique_ips = set(ips['address']) + logging.info(f'Pushing/getting {len(unique_domain_names)} DomainName {len(unique_host_names)} HostName ' + f'{len(unique_ips)} IP nodes...') + domains_id = self.iyp.batch_get_nodes_by_single_prop('DomainName', 'name', unique_domain_names) + hosts_id = self.iyp.batch_get_nodes_by_single_prop('HostName', 'name', unique_host_names) + ips_id = self.iyp.batch_get_nodes_by_single_prop('IP', 'ip', unique_ips) + + links_parent = list() + links_part_of = list() + links_alias_of = list() + links_managed_by = list() + links_resolves_to = list() + + logging.info('Computing relationships...') + start_ts = datetime.now().timestamp() + for index, connection in connections.iterrows(): + if connection['relation_name'] == 'PARENT': + links_parent.append({ + 'src_id': domains_id[connection['from_nodeKey']], + 'dst_id': 
domains_id[connection['to_nodeKey']], + 'props': [self.reference, connection['properties']], + }) + elif connection['relation_name'] == 'MANAGED_BY': + links_managed_by.append({ + 'src_id': domains_id[connection['from_nodeKey']], + 'dst_id': hosts_id[connection['to_nodeKey']], + 'props': [self.reference, connection['properties']], + }) + elif connection['relation_name'] == 'PART_OF': + links_part_of.append({ + 'src_id': hosts_id[connection['from_nodeKey']], + 'dst_id': domains_id[connection['to_nodeKey']], + 'props': [self.reference, connection['properties']], + }) + elif connection['relation_name'] == 'ALIAS_OF': + links_alias_of.append({ + 'src_id': hosts_id[connection['from_nodeKey']], + 'dst_id': hosts_id[connection['to_nodeKey']], + 'props': [self.reference, connection['properties']], + }) + elif connection['relation_name'] == 'RESOLVES_TO': + links_resolves_to.append({ + 'src_id': hosts_id[connection['from_nodeKey']], + 'dst_id': ips_id[connection['to_nodeKey']], + 'props': [self.reference, connection['properties']], + }) + else: + logging.error(f'Unknown relationship type: {connection["relation_name"]}') + stop_ts = datetime.now().timestamp() + logging.info(f'{stop_ts - start_ts:.2f}s elapsed') + + # Push all links to IYP + logging.info(f'Pushing {len(links_parent)} PARENT {len(links_part_of)} PART_OF {len(links_alias_of)} ALIAS_OF ' + f'{len(links_managed_by)} MANAGED_BY {len(links_resolves_to)} RESOLVES_TO relationships...') + self.iyp.batch_add_links('PARENT', links_parent) + self.iyp.batch_add_links('PART_OF', links_part_of) + self.iyp.batch_add_links('ALIAS_OF', links_alias_of) + self.iyp.batch_add_links('MANAGED_BY', links_managed_by) + self.iyp.batch_add_links('RESOLVES_TO', links_resolves_to) + + # Push the Authoritative NS Label + ns_id = [link['dst_id'] for link in links_managed_by] + logging.info(f'Adding AuthoritativeNameServer label to {len(ns_id)} nodes') + self.iyp.batch_add_node_label(ns_id, 'AuthoritativeNameServer') diff --git a/iyp/crawlers/openintel/dns_dependency_jp.py b/iyp/crawlers/openintel/dns_dependency_jp.py new file mode 100644 index 0000000..3a3993f --- /dev/null +++ b/iyp/crawlers/openintel/dns_dependency_jp.py @@ -0,0 +1,45 @@ +import argparse +import logging +import os +import sys + +from iyp.crawlers.openintel import DnsDependencyCrawler + +URL = 'https://storage.dacs.utwente.nl/sommeser-dnsdep/JP' +ORG = 'OpenINTEL' +NAME = 'openintel.dns_dependency_jp' + + +class Crawler(DnsDependencyCrawler): + def __init__(self, organization, url, name): + super().__init__(organization, url, name) + + +def main() -> None: + parser = argparse.ArgumentParser() + parser.add_argument('--unit-test', action='store_true') + args = parser.parse_args() + + scriptname = os.path.basename(sys.argv[0]).replace('/', '_')[0:-3] + FORMAT = '%(asctime)s %(levelname)s %(message)s' + logging.basicConfig( + format=FORMAT, + filename='log/' + scriptname + '.log', + level=logging.INFO, + datefmt='%Y-%m-%d %H:%M:%S' + ) + + logging.info(f'Started: {sys.argv}') + + crawler = Crawler(ORG, URL, NAME) + if args.unit_test: + crawler.unit_test(logging) + else: + crawler.run() + crawler.close() + logging.info(f'Finished: {sys.argv}') + + +if __name__ == '__main__': + main() + sys.exit(0) diff --git a/iyp/crawlers/openintel/dns_dependency_nl.py b/iyp/crawlers/openintel/dns_dependency_nl.py new file mode 100644 index 0000000..04c6729 --- /dev/null +++ b/iyp/crawlers/openintel/dns_dependency_nl.py @@ -0,0 +1,45 @@ +import argparse +import logging +import os +import sys + +from 
iyp.crawlers.openintel import DnsDependencyCrawler + +URL = 'https://storage.dacs.utwente.nl/sommeser-dnsdep/NL' +ORG = 'OpenINTEL' +NAME = 'openintel.dns_dependency_nl' + + +class Crawler(DnsDependencyCrawler): + def __init__(self, organization, url, name): + super().__init__(organization, url, name) + + +def main() -> None: + parser = argparse.ArgumentParser() + parser.add_argument('--unit-test', action='store_true') + args = parser.parse_args() + + scriptname = os.path.basename(sys.argv[0]).replace('/', '_')[0:-3] + FORMAT = '%(asctime)s %(levelname)s %(message)s' + logging.basicConfig( + format=FORMAT, + filename='log/' + scriptname + '.log', + level=logging.INFO, + datefmt='%Y-%m-%d %H:%M:%S' + ) + + logging.info(f'Started: {sys.argv}') + + crawler = Crawler(ORG, URL, NAME) + if args.unit_test: + crawler.unit_test(logging) + else: + crawler.run() + crawler.close() + logging.info(f'Finished: {sys.argv}') + + +if __name__ == '__main__': + main() + sys.exit(0)
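
For reference, a minimal sketch of the input layout the new DNS dependency crawler
expects (file and field names taken from `DnsDependencyCrawler.run()` above; the
concrete values are illustrative):

```python
import pandas as pd

# Files fetched per weekly snapshot, e.g. <URL>/year=2024/week=05/:
#   domain_nodes.json.gz  -> {"name": "example.nl"}
#   host_nodes.json.gz    -> {"name": "ns1.example.nl"}
#   ip_nodes.json.gz      -> {"address": "192.0.2.1"}
#   connections.json.gz   -> one relationship per line, as below
example_connection = {
    'relation_name': 'MANAGED_BY',  # PARENT | MANAGED_BY | PART_OF | ALIAS_OF | RESOLVES_TO
    'from_nodeKey': 'example.nl',   # key into the node files above
    'to_nodeKey': 'ns1.example.nl',
    'properties': {},               # merged into the relationship properties
}

# The crawler reads each file as JSON Lines; a stand-in for pd.read_json(..., lines=True):
connections = pd.DataFrame([example_connection])
```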