crawler.py
import os
import pathlib
import time

import requests
from bs4 import BeautifulSoup
from urllib.parse import urljoin, urlparse
from urllib import robotparser


class Crawler:
    """
    Crawler that fetches pages starting from a seed URL.

    Pages are visited in breadth-first (BFS) order using a FIFO queue,
    and the text of each indexed page is written to the collection directory.
    """
    def __init__(self, start_link, page_limit, crawl_delay, robot_exlusion, test_mode):
        self.start_link = start_link
        self.page_limit = page_limit
        self.robot_exlusion = robot_exlusion
        self.test_mode = test_mode
        self.url_agent = "*"
        self.collection_dir = "data/collection"
        # Parse robots.txt for the start URL's host
        self.robot_parser = robotparser.RobotFileParser()
        urlparsed = urlparse(self.start_link)
        robots_url = urlparsed.scheme + "://" + urlparsed.netloc + "/robots.txt"
        self.robot_parser.set_url(robots_url)
        self.robot_parser.read()
        # Prefer the crawl delay from robots.txt; otherwise fall back to the
        # configured delay (given in milliseconds)
        delay = self.robot_parser.crawl_delay(self.url_agent)
        self.crawl_delay = delay if delay is not None else crawl_delay / 1000
        # Dict used to keep track of the URLs that have already been visited
        self.visited_links = {}
        # FIFO queue of URLs that still need to be visited
        self.queue = [self.start_link]
    def append_to_collection(self, text):
        # Write the page text to a numbered file inside the collection directory
        f_path = os.path.join(self.collection_dir, str(len(self.visited_links)) + ".txt")
        pathlib.Path(self.collection_dir).mkdir(parents=True, exist_ok=True)
        with open(f_path, "w", encoding="utf-8") as f:
            f.write(text)
    def check_for_index_and_follow(self, soup):
        # Check the HTML robots META tag, e.g.
        # <meta name="robots" content="noindex, nofollow">
        add_to_index, follow_links = True, True
        meta = soup.find("meta", attrs={"name": lambda v: v is not None and v.lower() == "robots"})
        if meta is not None and meta.get("content"):
            contents = {c.strip().upper() for c in meta["content"].split(",")}
            if "NOFOLLOW" in contents:
                follow_links = False
            if "NOINDEX" in contents:
                add_to_index = False
        return add_to_index, follow_links
    def crawl(self):
        while len(self.visited_links) < self.page_limit and len(self.queue) > 0:
            # If not running in test mode, sleep between requests to respect
            # the crawl delay
            if not self.test_mode:
                print("delayed")
                time.sleep(self.crawl_delay)
            # Get the next URL from the queue (FIFO, i.e. BFS order)
            url = self.queue.pop(0)
            # If robot exclusion is enabled, check robots.txt before fetching
            if self.robot_exlusion and not self.robot_parser.can_fetch(self.url_agent, url):
                continue
            # Drop the fragment so that page.html and page.html#section
            # count as the same page
            url = url.split("#")[0]
            # Skip the URL if it has already been visited
            if self.visited_links.get(url, False):
                print("already visited skipping")
                continue
            # Mark the URL as visited
            self.visited_links[url] = True
            print(f"{url} visited | count {len(self.visited_links)}")
            # Send a GET request to the URL
            try:
                response = requests.get(url, timeout=10)
            except requests.RequestException:
                print("http request error skipping")
                continue
            # Check if the response was successful
            if response.status_code != 200:
                print("http request error skipping")
                continue
            # Parse the HTML content of the page
            soup = BeautifulSoup(response.text, "html.parser")
            # Check the robots META tag for indexing/following directives
            add_to_index, follow_links = self.check_for_index_and_follow(soup)
            # Follow links unless the META tag specifies NOFOLLOW
            if follow_links:
                # Find all links on the page
                links = soup.find_all("a")
                print("followed")
                # Loop through each link
                for link in links:
                    # Get the value of the href attribute
                    href = link.get("href")
                    # Skip anchors without an href attribute
                    if not href:
                        continue
                    # Check if the href value is a relative URL
                    if not urlparse(href).netloc:
                        # Convert the relative URL to an absolute URL
                        href = urljoin(url, href)
                    # Only queue URLs within the same domain as the starting URL
                    if urlparse(href).netloc == urlparse(self.start_link).netloc:
                        # Add the URL to the queue
                        self.queue.append(href)
            # Index the page text unless the META tag specifies NOINDEX
            if add_to_index:
                print("indexed")
                # Append the page text to the collection
                self.append_to_collection(soup.get_text())
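

# A minimal usage sketch, not part of the original module: the seed URL,
# page limit, and delay below are illustrative assumptions, not values taken
# from the project's configuration.
if __name__ == "__main__":
    crawler = Crawler(
        start_link="https://example.com/",  # assumed seed URL
        page_limit=50,                      # assumed maximum number of pages
        crawl_delay=1000,                   # fallback delay in milliseconds
        robot_exlusion=True,                # honour robots.txt rules
        test_mode=False,                    # sleep between requests
    )
    crawler.crawl()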