Skip to content

Commit

Permalink
Update DNS dependency crawler
Browse files Browse the repository at this point in the history
Use only the connections file to calculate nodes and relationships.
  • Loading branch information
m-appel committed Feb 15, 2024
1 parent 5ba18d1 commit 5abddd7
Showing 1 changed file with 10 additions and 16 deletions.
26 changes: 10 additions & 16 deletions iyp/crawlers/openintel/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -273,44 +273,41 @@ def run(self):
year = current_date.strftime('%Y')
week = current_date.strftime('%U')
base_url = f'{self.reference["reference_url"]}/year={year}/week={week}'
probe_url = f'{base_url}/domain_nodes.json.gz'
probe_url = f'{base_url}/connections.json.gz'
if requests.head(probe_url).ok:
logging.info(f'Using year={year}/week={week} ({current_date.strftime("%Y-%m-%d")})')
break
else:
logging.error('Failed to find data within the specified lookback interval.')
raise RequestStatusError('Failed to find data within the specified lookback interval.')

logging.info('Reading domain names')
domains = pd.read_json(f'{base_url}/domain_nodes.json.gz', lines=True)
logging.info('Reading host names')
hosts = pd.read_json(f'{base_url}/host_nodes.json.gz', lines=True)
logging.info('Reading IPs')
ips = pd.read_json(f'{base_url}/ip_nodes.json.gz', lines=True)
logging.info('Reading connections')
connections = pd.read_json(f'{base_url}/connections.json.gz', lines=True)

logging.info('Stripping root "." and normalizing IPs')
# Remove root "." from names that are not the root.
domains['name'] = domains['name'].map(self.remove_root)
hosts['name'] = hosts['name'].map(self.remove_root)
# Currently there are only DOMAIN and HOSTNAME entries in from_nodeType, but
# maybe that changes in the future.
connections.loc[connections['from_nodeType'].isin(('DOMAIN', 'HOSTNAME')), 'from_nodeKey'] = \
connections.loc[connections['from_nodeType'].isin(('DOMAIN', 'HOSTNAME')), 'from_nodeKey'].map(self.remove_root)
connections.loc[connections['to_nodeType'].isin(('DOMAIN', 'HOSTNAME')), 'to_nodeKey'] = \
connections.loc[connections['to_nodeType'].isin(('DOMAIN', 'HOSTNAME')), 'to_nodeKey'].map(self.remove_root)
# Normalize IPv6 addresses.
ips['address'] = ips['address'].map(self.normalize_ipv6)
connections.loc[connections['from_nodeType'] == 'IP', 'from_nodeKey'] = \
connections.loc[connections['from_nodeType'] == 'IP', 'from_nodeKey'].map(self.normalize_ipv6)
connections.loc[connections['to_nodeType'] == 'IP', 'to_nodeKey'] = \
connections.loc[connections['to_nodeType'] == 'IP', 'to_nodeKey'].map(self.normalize_ipv6)

# Pandas' unique is faster than plain set.
unique_domain_names = set(domains['name'].unique())
unique_host_names = set(hosts['name'].unique())
unique_ips = set(ips['address'].unique())
unique_domain_names = set()
unique_host_names = set()
unique_ips = set()
logging.info('Getting unique nodes')
for node_type, node_key in [('from_nodeType', 'from_nodeKey'), ('to_nodeType', 'to_nodeKey')]:
unique_domain_names.update(connections[connections[node_type] == 'DOMAIN'][node_key].unique())
unique_host_names.update(connections[connections[node_type] == 'HOSTNAME'][node_key].unique())
unique_ips.update(connections[connections[node_type] == 'IP'][node_key].unique())

logging.info(f'Pushing/getting {len(unique_domain_names)} DomainName {len(unique_host_names)} HostName '
f'{len(unique_ips)} IP nodes...')
domains_id = self.iyp.batch_get_nodes_by_single_prop('DomainName', 'name', unique_domain_names)
Expand All @@ -325,7 +322,6 @@ def run(self):
unique_relationships = set()

logging.info('Computing relationships...')
start_ts = datetime.now().timestamp()
for connection in connections.itertuples():
relationship_tuple = (connection.relation_name,
connection.from_nodeType,
Expand Down Expand Up @@ -368,8 +364,6 @@ def run(self):
})
else:
logging.error(f'Unknown relationship type: {connection.relation_name}')
stop_ts = datetime.now().timestamp()
logging.info(f'{stop_ts - start_ts:.2f}s elapsed')

# Push all links to IYP
logging.info(f'Pushing {len(links_parent)} PARENT {len(links_part_of)} PART_OF {len(links_alias_of)} ALIAS_OF '
Expand Down

0 comments on commit 5abddd7

Please sign in to comment.