-
Notifications
You must be signed in to change notification settings - Fork 2
/
google_maps_scraping.py
118 lines (94 loc) Β· 3.31 KB
/
google_maps_scraping.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
from selenium import webdriver
import time
import csv
import requests
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.keys import Keys
from selenium.webdriver import ActionChains
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
import chromedriver_autoinstaller
import requests
import re
import traceback
from bs4 import BeautifulSoup
# Browser-like User-Agent sent with every outbound HTTP request so the
# email-lookup site (skymem.info) does not reject the scraper as a bot.
headers = {
    'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/104.0.0.0 Safari/537.36'
}
from openpyxl import load_workbook, Workbook
# Output spreadsheet path. NOTE(review): despite the name this is a full
# workbook filename, not a sheet title.
sheet_name = 'data.xlsx'
# Write Headline and create a new excel sheet
def xl_sheet_headlines(sheet_name=sheet_name):
    """Create a fresh workbook at *sheet_name* holding only the header row.

    Overwrites any existing file of the same name.
    """
    workbook = Workbook()
    sheet = workbook.active
    sheet.append(['url', 'name', 'address', 'website', 'phone', 'category', 'email'])
    workbook.save(sheet_name)

# Start every run with a clean output file.
xl_sheet_headlines()
# Write Data On existing sheet
def xl_write(data_write, sheet_name=sheet_name):
    """Append one row of scraped values to the existing workbook."""
    workbook = load_workbook(sheet_name)
    workbook.active.append(data_write)
    workbook.save(sheet_name)
def driver_define():
    """Install a matching chromedriver and return a ready Chrome WebDriver."""
    print('Chromedriver Installing')
    driver_path = chromedriver_autoinstaller.install()
    print('Chrome Browser Opening')
    chrome_options = Options()
    # Suppress the noisy "DevTools listening ..." console logging.
    chrome_options.add_experimental_option('excludeSwitches', ['enable-logging'])
    return webdriver.Chrome(service=Service(driver_path), options=chrome_options)
# Email Get
def get_email(url):
    """Look up an email address for *url*'s domain via skymem.info.

    Parameters:
        url: a website URL; its bare domain is extracted and searched.

    Returns:
        The first scraped address that contains the domain.

    Raises:
        ValueError: when no matching address is found. (The original code
        indexed ``[0]`` on a possibly-empty list and crashed with an opaque
        IndexError; callers treat any exception as "no email available",
        so raising a clear error is backward-compatible.)
    """
    domain = url.split('//')[-1].replace('www.', '').split('/')[0]
    url_gen = f'http://www.skymem.info/srch?q={domain}'
    # timeout added so a stalled lookup cannot hang the whole scrape
    response = requests.get(url_gen, headers=headers, timeout=15)
    soup = BeautifulSoup(response.text, 'lxml')
    email_list = re.findall(r"href=\"\/srch\?q=(.*?@.*)\">", str(soup))
    for candidate in email_list:
        if domain in candidate:
            return candidate
    raise ValueError(f'no email found for {domain}')
driver = driver_define()
urls_filename = 'urls.txt'
# Read target Google Maps URLs, one per line. The original left the file
# handle open; `with` closes it promptly.
with open(urls_filename) as url_file:
    urls = [line.strip('\n') for line in url_file]

def _scrape(fetch):
    """Run *fetch* and return its value, or 'N/A' when the element is absent.

    Narrowed from the original bare ``except:`` so KeyboardInterrupt and
    SystemExit still propagate and the run can be stopped with Ctrl-C.
    """
    try:
        return fetch()
    except Exception:
        return 'N/A'

try:
    for url in urls:
        driver.get(url)
        print('--------------------------')
        # Business name: wait up to 5 s for the page header to render.
        name = _scrape(lambda: WebDriverWait(driver, 5).until(
            EC.presence_of_element_located((By.XPATH, '//h1'))).text)
        time.sleep(1)  # give the details pane a moment to finish loading
        address = _scrape(lambda: driver.find_element(
            By.XPATH, '//button[@data-item-id="address"]').text)
        website = _scrape(lambda: driver.find_element(
            By.CSS_SELECTOR, 'a[aria-label^="Website:"]').get_attribute('href'))
        phone = _scrape(lambda: driver.find_element(
            By.CSS_SELECTOR, 'button[aria-label*="Phone:"]').text)
        category = _scrape(lambda: driver.find_element(
            By.CSS_SELECTOR, '[jsaction="pane.rating.category"]').text)
        # Only attempt the email lookup when a plausible website URL was found
        # (same guard as the original: 'N/A' has length 3 and is skipped).
        email = 'N/A'
        if len(website) > 3:
            email = _scrape(lambda: get_email(website))
        print(f"name : {name}")
        print(f"address : {address}")
        # f-prefix dropped below: these strings have no placeholders, the
        # values are passed as separate print() arguments.
        print("website:", website)
        print("phone:", phone)
        print("category:", category)
        print("email:", email)
        xl_write([url, name, address, website, phone, category, email])
finally:
    # Release the browser process even if the loop crashes (resource leak
    # in the original: the driver was never quit).
    driver.quit()