forked from ScrapeGraphAI/Scrapegraph-ai
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathextract_data.py
27 lines (21 loc) · 1000 Bytes
/
extract_data.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
def extract_data(html: str) -> dict:
    """Extract project titles and descriptions from an HTML page.

    Every ``<div class="grid-item">`` is treated as one project entry. The
    title is taken from the first ``<h4 class="card-title">`` inside it and
    the description from the first ``<p class="card-text">``; both are
    whitespace-stripped.

    Args:
        html: HTML markup to parse.

    Returns:
        ``{'projects': [{'title': ..., 'description': ...}, ...]}`` with the
        entries in document order. Entries that have no title element are
        skipped, and an entry without a description element gets ``''`` as
        its description.
    """
    # Kept as a function-local import, as in the original, so the block
    # does not rely on any module-level import being present.
    from bs4 import BeautifulSoup

    soup = BeautifulSoup(html, 'html.parser')

    projects = []
    for entry in soup.find_all('div', class_='grid-item'):
        title_tag = entry.find('h4', class_='card-title')
        if title_tag is None:
            # find() returns None when nothing matches; calling get_text()
            # on it would raise AttributeError and abort the whole
            # extraction because of a single non-project grid item.
            continue

        description_tag = entry.find('p', class_='card-text')
        if description_tag is None:
            description = ''
        else:
            description = description_tag.get_text(strip=True)

        projects.append({
            'title': title_tag.get_text(strip=True),
            'description': description,
        })

    return {'projects': projects}