Skip to content

Commit

Permalink
Events scraping (#244)
Browse files Browse the repository at this point in the history
* initial changes from ashley

* events type

* change models

* linting

* fix linting

* tests

* tests

* TESTS

* format

* quotation marks

* black

* tests

* end time

* formatting

* Lint

* :(

* oops

* Lint

* Migration file

* Lint and removed facebook field

* all day

* more pythonic

---------

Co-authored-by: vcai122 <[email protected]>
Co-authored-by: Justin Zhang <[email protected]>
  • Loading branch information
3 people authored Feb 28, 2024
1 parent c5e7088 commit 90418f9
Show file tree
Hide file tree
Showing 9 changed files with 278 additions and 36 deletions.
141 changes: 141 additions & 0 deletions backend/penndata/management/commands/get_penn_today_events.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,141 @@
import datetime
from urllib.parse import urljoin

from bs4 import BeautifulSoup
from django.core.management.base import BaseCommand
from django.utils import timezone
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait

from penndata.models import Event


# Landing page listing upcoming Penn Today events; also the base URL that
# relative event links are joined against.
PENN_TODAY_WEBSITE = "https://penntoday.upenn.edu/events"
# Marker text Penn Today uses for events that have no specific start time.
ALL_DAY = "all day"


class Command(BaseCommand):
    """Scrape upcoming events from Penn Today and upsert them as Event rows.

    Penn Today renders its events list client-side, so a Selenium-driven
    Chrome is used to load the page, and BeautifulSoup parses the rendered
    HTML. Each event is stored via ``Event.objects.update_or_create`` keyed
    on the event name.
    """

    def handle(self, *args, **kwargs):
        now = timezone.localtime()
        current_month, current_year = now.month, now.year

        # Clears out previous Events
        # past_events = Event.objects.filter(end__lt=now.date())
        # past_events.delete()

        # Render the (JavaScript-populated) events list with Selenium.
        try:
            driver = webdriver.Chrome()
            try:
                driver.get(PENN_TODAY_WEBSITE)
                events_list = WebDriverWait(driver, 10).until(
                    EC.presence_of_element_located((By.ID, "events-list"))
                )
                html_content = events_list.get_attribute("innerHTML")
            finally:
                # Always release the browser, even if the wait times out —
                # otherwise each failed run leaks a Chrome process.
                driver.quit()
        # NOTE(review): Selenium failures raise WebDriverException, not
        # ConnectionError — confirm which errors this is meant to swallow.
        except ConnectionError:
            return None

        soup = BeautifulSoup(html_content, "html.parser")

        event_articles = soup.find_all("article", class_="tease")

        for article in event_articles:
            name = article.find("h3", class_="tease__head").text.strip()
            description = article.find("div", class_="tease__dek").text.strip()
            start_date_str = article.find("p", class_="tease__date").text.strip()
            meta_elements = article.find_all("p", class_="tease__meta--sm")
            if len(meta_elements) >= 2:
                # Strip periods so "5:30 p.m." becomes "5:30 pm" for %I:%M%p.
                start_time_str = meta_elements[0].text.strip().replace(".", "")
                location = meta_elements[1].text.strip()
            else:
                # Only one (or zero) meta rows: treat as an all-day event.
                start_time_str = ALL_DAY
                location = None

            # Optional "Through MM/DD/YYYY" row marks a multi-day event.
            end_date_elem = article.find(
                "p", class_="tease__meta--sm", string=lambda x: "Through" in str(x)
            )

            if start_date_str in ("02/29", "2/29"):
                # strptime defaults the year to 1900 (not a leap year), so
                # "02/29" cannot be parsed directly; parse 02/28 and add a day
                # after the real year has been applied.
                start_date = datetime.datetime.strptime("02/28", "%m/%d").replace(year=current_year)
                if start_date.month < current_month:
                    # Scraped month already passed this year, so the event is
                    # next year.
                    start_date = start_date.replace(year=current_year + 1)
                start_date = start_date + datetime.timedelta(days=1)
            else:
                start_date = datetime.datetime.strptime(start_date_str, "%m/%d").replace(
                    year=current_year
                )
                if start_date.month < current_month:
                    # If scraped month is before current month, increment year
                    start_date = start_date.replace(year=current_year + 1)
            if start_time_str == ALL_DAY:
                start_time = datetime.time(0, 0)  # midnight for all-day events
            else:
                start_time = datetime.datetime.strptime(start_time_str, "%I:%M%p").time()
            start_date = datetime.datetime.combine(start_date, start_time)

            event_url = urljoin(PENN_TODAY_WEBSITE, article.find("a", class_="tease__link")["href"])

            end_time = self.get_end_time(event_url)
            if end_time is not None:
                if end_date_elem:  # end date and end time
                    end_date_str = end_date_elem.text.strip().split(" ")[-1]
                    end_date = datetime.datetime.strptime(end_date_str, "%m/%d/%Y")
                    end_time = datetime.datetime.strptime(end_time, "%I:%M %p").time()
                    end_date = datetime.datetime.combine(end_date, end_time)
                else:  # no end date but end time: event ends the same day
                    end_time = datetime.datetime.strptime(end_time, "%I:%M %p").time()
                    end_date = datetime.datetime.combine(start_date, end_time)
            else:
                # No end time: assume the event runs to the end of its last day.
                end_of_day = datetime.time(23, 59, 59)
                if end_date_elem:  # end date but no end time
                    end_date_str = end_date_elem.text.strip().split(" ")[-1]
                    # Fixed: was `datetime.combine(...)`, which raised
                    # AttributeError (the module has no `combine`).
                    end_date = datetime.datetime.combine(
                        datetime.datetime.strptime(end_date_str, "%m/%d/%Y"), end_of_day
                    )
                else:  # no end date or end time
                    end_date = datetime.datetime.combine(start_date, end_of_day)

            # Upsert keyed on name so re-running the command refreshes rather
            # than duplicates events.
            Event.objects.update_or_create(
                name=name,
                defaults={
                    "event_type": "",
                    "image_url": "",
                    "start": start_date,
                    "end": end_date,
                    "location": location,
                    "website": event_url,
                    "description": description,
                    "email": "",
                },
            )

        self.stdout.write("Uploaded Events!")

    def get_end_time(self, event_url):
        """Load an event's detail page and return its end-time string.

        Returns e.g. ``"6:00 PM"`` (periods stripped), or ``None`` when the
        event is all-day or only a single start time is listed.

        Fixed: the original definition was missing ``self`` even though the
        method is invoked as ``self.get_end_time(...)``, which raised
        TypeError on every call.
        """
        driver = webdriver.Chrome()
        try:
            driver.get(event_url)
            event_element = WebDriverWait(driver, 10).until(
                EC.presence_of_element_located((By.CLASS_NAME, "event__topper-content"))
            )
            end_time_soup = BeautifulSoup(event_element.get_attribute("innerHTML"), "html.parser")
        finally:
            # Single cleanup point replaces the four scattered quit() calls.
            driver.quit()

        end_time_range_str = (
            end_time_soup.find("p", class_="event__meta event__time").text.strip().replace(".", "")
        )
        if not end_time_range_str or ALL_DAY in end_time_range_str.lower():
            return None  # No end time if the event is all day
        # Expect a range like "5:00 pm - 6:00 pm"; a lone time has no end.
        times = end_time_range_str.split(" - ")
        if len(times) <= 1:
            return None
        return times[1]
33 changes: 33 additions & 0 deletions backend/penndata/migrations/0009_auto_20240223_1820.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,33 @@
# Generated by Django 3.2.22 on 2024-02-23 23:20

from django.db import migrations, models


class Migration(migrations.Migration):
    """Rename Event's time fields and relax several columns to nullable.

    ``start_time`` becomes ``start``; ``end_time`` is dropped in favor of a
    new nullable ``end``; a nullable ``location`` is added.
    """

    dependencies = [
        ("penndata", "0008_calendarevent"),
    ]

    operations = [
        migrations.RenameField(
            model_name="event",
            old_name="start_time",
            new_name="start",
        ),
        migrations.RemoveField(
            model_name="event",
            name="end_time",
        ),
        migrations.AddField(
            model_name="event",
            name="end",
            field=models.DateTimeField(null=True),
        ),
        migrations.AddField(
            model_name="event",
            name="location",
            field=models.CharField(max_length=255, null=True),
        ),
        migrations.AlterField(
            model_name="event",
            name="description",
            field=models.TextField(null=True),
        ),
        migrations.AlterField(
            model_name="event",
            name="email",
            field=models.CharField(max_length=255, null=True),
        ),
        migrations.AlterField(
            model_name="event",
            name="event_type",
            field=models.CharField(max_length=255, null=True),
        ),
        migrations.AlterField(
            model_name="event",
            name="image_url",
            field=models.URLField(null=True),
        ),
    ]
43 changes: 43 additions & 0 deletions backend/penndata/migrations/0010_auto_20240228_0150.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,43 @@
# Generated by Django 3.2.22 on 2024-02-28 06:50

from django.db import migrations, models


class Migration(migrations.Migration):
    """Drop Event.facebook and mark the optional Event columns blank-able."""

    dependencies = [
        ("penndata", "0009_auto_20240223_1820"),
    ]

    operations = [
        migrations.RemoveField(
            model_name="event",
            name="facebook",
        ),
        migrations.AlterField(
            model_name="event",
            name="description",
            field=models.TextField(blank=True, null=True),
        ),
        migrations.AlterField(
            model_name="event",
            name="email",
            field=models.CharField(blank=True, max_length=255, null=True),
        ),
        migrations.AlterField(
            model_name="event",
            name="end",
            field=models.DateTimeField(blank=True, null=True),
        ),
        migrations.AlterField(
            model_name="event",
            name="event_type",
            field=models.CharField(blank=True, max_length=255, null=True),
        ),
        migrations.AlterField(
            model_name="event",
            name="image_url",
            field=models.URLField(blank=True, null=True),
        ),
        migrations.AlterField(
            model_name="event",
            name="location",
            field=models.CharField(blank=True, max_length=255, null=True),
        ),
        migrations.AlterField(
            model_name="event",
            name="website",
            field=models.URLField(blank=True, max_length=255, null=True),
        ),
    ]
16 changes: 8 additions & 8 deletions backend/penndata/models.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,15 +9,15 @@


class Event(models.Model):
    """A campus event scraped from an external source (e.g. Penn Today).

    Reconstructed post-commit state: the scraped diff interleaved the removed
    field definitions (start_time/end_time/facebook and the non-null variants
    of event_type/description/image_url) with the added ones, leaving
    duplicate and stale fields; only the new definitions are kept here.
    Only ``name`` and ``start`` are required; everything else is optional.
    """

    # Category tag used by the /events/<type>/ endpoint filter.
    event_type = models.CharField(max_length=255, null=True, blank=True)
    name = models.CharField(max_length=255)
    description = models.TextField(null=True, blank=True)
    image_url = models.URLField(null=True, blank=True)
    # start is required; end is optional (scrapers fall back to end-of-day).
    start = models.DateTimeField()
    end = models.DateTimeField(null=True, blank=True)
    location = models.CharField(max_length=255, null=True, blank=True)
    email = models.CharField(max_length=255, null=True, blank=True)
    website = models.URLField(max_length=255, null=True, blank=True)


class HomePageOrder(models.Model):
Expand Down
6 changes: 3 additions & 3 deletions backend/penndata/serializers.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,12 +17,12 @@ class Meta:
"event_type",
"name",
"description",
"location",
"image_url",
"start_time",
"end_time",
"start",
"end",
"email",
"website",
"facebook",
)


Expand Down
3 changes: 2 additions & 1 deletion backend/penndata/urls.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,7 +18,8 @@
path("news/", News.as_view(), name="news"),
path("calendar/", Calendar.as_view(), name="calendar"),
path("homepage", HomePage.as_view(), name="homepage"),
path("events/<type>/", Events.as_view(), name="events"),
path("events/", Events.as_view(), name="events"),
path("events/<str:type>/", Events.as_view(), name="events-type"),
path("order/", HomePageOrdering.as_view(), name="home-page-order"),
path("fitness/rooms/", FitnessRoomView.as_view(), name="fitness"),
path("fitness/usage/<room_id>/", FitnessUsage.as_view(), name="fitness-usage"),
Expand Down
11 changes: 10 additions & 1 deletion backend/penndata/views.py
Original file line number Diff line number Diff line change
Expand Up @@ -104,7 +104,16 @@ class Events(generics.ListAPIView):
serializer_class = EventSerializer

def get_queryset(self):
return Event.objects.filter(event_type=self.kwargs.get("type", ""))
queryset = Event.objects.all()

event_type = self.kwargs.get("type")
if event_type:
queryset = queryset.filter(event_type=event_type)

queryset = queryset.filter(
end__gte=timezone.localtime(), start__lte=timezone.localtime() + timedelta(days=30)
)
return queryset


class Analytics(generics.CreateAPIView):
Expand Down
53 changes: 30 additions & 23 deletions backend/tests/penndata/test_views.py
Original file line number Diff line number Diff line change
Expand Up @@ -56,35 +56,42 @@ def test_response(self):
class TestEvent(TestCase):
    """Tests for the /events/ and /events/<type>/ endpoints.

    Reconstructed post-commit state: the scraped diff interleaved the removed
    setUp/test_response bodies (which used the old start_time/end_time/
    facebook fields) with the new ones; only the new code is kept.
    """

    def setUp(self):
        self.client = APIClient()
        # "end" is far in the future so the view's end__gte=now filter
        # keeps both fixtures visible.
        self.event1 = Event.objects.create(
            event_type="type1",
            name="Event 1",
            description="Description 1",
            start="2024-02-14T10:00:00Z",
            end="2099-02-14T12:00:00Z",
            location="Location 1",
            website="https://pennlabs.org/",
        )
        self.event2 = Event.objects.create(
            event_type="type2",
            name="Event 2",
            description="Description 2",
            start="2024-02-15T10:00:00Z",
            end="2099-02-15T12:00:00Z",
            location="Location 2",
            website="https://pennlabs.org/",
        )

    def test_get_all_events(self):
        """Test GET request to retrieve all events (no type)"""
        url = reverse("events")
        response = self.client.get(url)
        events = Event.objects.all()
        res_json = json.loads(response.content)
        self.assertEqual(len(events), len(res_json))

    def test_get_events_by_type(self):
        """Test GET request to retrieve events by type"""
        url = reverse("events-type", kwargs={"type": "type1"})
        response = self.client.get(url)
        events = Event.objects.filter(event_type="type1")
        res_json = json.loads(response.content)
        self.assertEqual(len(events), len(res_json))
        event = res_json[0]
        self.assertEqual("Event 1", event["name"])


class TestHomePage(TestCase):
Expand Down
8 changes: 8 additions & 0 deletions k8s/main.ts
Original file line number Diff line number Diff line change
Expand Up @@ -121,6 +121,14 @@ export class MyChart extends PennLabsChart {
cmd: ["python", "manage.py", "get_calendar"],
env: [{ name: "DJANGO_SETTINGS_MODULE", value: "pennmobile.settings.production" }]
});

// Nightly scrape of Penn Today events via the Django management command
// added in this commit (backend/penndata/management/commands/).
new CronJob(this, 'get-penn-today-events', {
  schedule:'0 15 * * *', // Every day at 3 PM
  image: backendImage,
  secret,
  cmd: ["python", "manage.py", "get_penn_today_events"],
  env: [{ name: "DJANGO_SETTINGS_MODULE", value: "pennmobile.settings.production" }]
});
}
}

Expand Down

0 comments on commit 90418f9

Please sign in to comment.