From 90418f9f4a4efded6de86e6df2041bb1da73c268 Mon Sep 17 00:00:00 2001 From: ashleyzhang01 <69987606+ashleyzhang01@users.noreply.github.com> Date: Wed, 28 Feb 2024 16:59:56 -0500 Subject: [PATCH] Events scraping (#244) * initial changes from ashley * events type * change models * linting * fix linting * tests * tests * TESTS * format * quotation marks * black * tests * end time * formatting * Lint * :( * oops * Lint * Migration file * Lint and removed facebook field * all day * more pythonic --------- Co-authored-by: vcai122 Co-authored-by: Justin Zhang --- .../commands/get_penn_today_events.py | 141 ++++++++++++++++++ .../migrations/0009_auto_20240223_1820.py | 33 ++++ .../migrations/0010_auto_20240228_0150.py | 43 ++++++ backend/penndata/models.py | 16 +- backend/penndata/serializers.py | 6 +- backend/penndata/urls.py | 3 +- backend/penndata/views.py | 11 +- backend/tests/penndata/test_views.py | 53 ++++--- k8s/main.ts | 8 + 9 files changed, 278 insertions(+), 36 deletions(-) create mode 100644 backend/penndata/management/commands/get_penn_today_events.py create mode 100644 backend/penndata/migrations/0009_auto_20240223_1820.py create mode 100644 backend/penndata/migrations/0010_auto_20240228_0150.py diff --git a/backend/penndata/management/commands/get_penn_today_events.py b/backend/penndata/management/commands/get_penn_today_events.py new file mode 100644 index 00000000..fe774ad4 --- /dev/null +++ b/backend/penndata/management/commands/get_penn_today_events.py @@ -0,0 +1,141 @@ +import datetime +from urllib.parse import urljoin + +from bs4 import BeautifulSoup +from django.core.management.base import BaseCommand +from django.utils import timezone +from selenium import webdriver +from selenium.webdriver.common.by import By +from selenium.webdriver.support import expected_conditions as EC +from selenium.webdriver.support.ui import WebDriverWait + +from penndata.models import Event + + +PENN_TODAY_WEBSITE = "https://penntoday.upenn.edu/events" +ALL_DAY = "all day" 
class Command(BaseCommand):
    """Scrape the Penn Today events listing and upsert each event as an ``Event`` row.

    Every listed event also has its detail page loaded to look for an end time.
    Rows are matched on ``name`` only; nothing is deleted by this command.
    """

    def handle(self, *args, **kwargs):
        """Scrape ``PENN_TODAY_WEBSITE`` and create or update one ``Event`` per article.

        Returns ``None`` early (after writing to stderr) when a ``ConnectionError``
        is raised while loading the listing page. Any other exception propagates.
        """
        now = timezone.localtime()

        # Scrapes Penn Today
        try:
            html_content = self._fetch_inner_html(PENN_TODAY_WEBSITE, (By.ID, "events-list"))
        except ConnectionError:
            self.stderr.write("Connection error while scraping Penn Today; no events updated.")
            return None

        soup = BeautifulSoup(html_content, "html.parser")

        for article in soup.find_all("article", class_="tease"):
            name = article.find("h3", class_="tease__head").text.strip()
            description = article.find("div", class_="tease__dek").text.strip()
            start_date_str = article.find("p", class_="tease__date").text.strip()

            # With two or more meta paragraphs the first is read as the start
            # time and the second as the location; otherwise the event is
            # treated as all-day with no location.
            meta_elements = article.find_all("p", class_="tease__meta--sm")
            if len(meta_elements) >= 2:
                start_time_str = meta_elements[0].text.strip().replace(".", "")
                location = meta_elements[1].text.strip()
            else:
                start_time_str = ALL_DAY
                location = None

            # Multi-day events carry a "Through <MM/DD/YYYY>" paragraph.
            end_date_elem = article.find(
                "p", class_="tease__meta--sm", string=lambda x: "Through" in str(x)
            )

            start_day = self._resolve_start_date(start_date_str, now.month, now.year)
            if start_time_str == ALL_DAY:
                start_time = datetime.time(0, 0)
            else:
                start_time = datetime.datetime.strptime(start_time_str, "%I:%M%p").time()
            # NOTE(review): start/end are naive datetimes; confirm how the
            # project's time-zone settings interpret them on save.
            start = datetime.datetime.combine(start_day, start_time)

            event_url = urljoin(PENN_TODAY_WEBSITE, article.find("a", class_="tease__link")["href"])
            end = self._resolve_end(start, end_date_elem, self.get_end_time(event_url))

            Event.objects.update_or_create(
                name=name,
                defaults={
                    "event_type": "",
                    "image_url": "",
                    "start": start,
                    "end": end,
                    "location": location,
                    "website": event_url,
                    "description": description,
                    "email": "",
                },
            )

        self.stdout.write("Uploaded Events!")

    @staticmethod
    def _fetch_inner_html(url, locator):
        """Load *url* in Chrome and return the ``innerHTML`` of the element at *locator*.

        Waits up to 10 seconds for the element to be present. The browser is
        always shut down, including when loading or waiting raises.
        """
        driver = webdriver.Chrome()
        try:
            driver.get(url)
            element = WebDriverWait(driver, 10).until(EC.presence_of_element_located(locator))
            return element.get_attribute("innerHTML")
        finally:
            driver.quit()

    @staticmethod
    def _resolve_start_date(date_str, current_month, current_year):
        """Turn a scraped ``MM/DD`` string into a midnight ``datetime`` with a year.

        A month earlier than *current_month* is taken to mean next year.
        Raises ``ValueError`` when *date_str* is not a valid ``MM/DD`` date.
        """
        # Parse against a leap year so that "02/29" is accepted; the real year
        # is substituted below.
        parsed = datetime.datetime.strptime(date_str + "/2000", "%m/%d/%Y")
        year = current_year + 1 if parsed.month < current_month else current_year
        try:
            return parsed.replace(year=year)
        except ValueError:
            # Only Feb 29 can fail here (target year is not a leap year);
            # roll forward to the following day, i.e. Mar 1.
            return datetime.datetime(year, 3, 1)

    @staticmethod
    def _resolve_end(start, end_date_elem, end_time_str):
        """Combine the optional end date and optional end time into one ``datetime``.

        The end day falls back to the day of *start*, and the end time falls
        back to 23:59:59 when *end_time_str* is ``None``.
        """
        if end_date_elem is not None:
            # The date is the last space-separated token of "Through MM/DD/YYYY".
            end_date_str = end_date_elem.text.strip().split(" ")[-1]
            end_day = datetime.datetime.strptime(end_date_str, "%m/%d/%Y")
        else:
            end_day = start

        if end_time_str is not None:
            end_clock = datetime.datetime.strptime(end_time_str, "%I:%M %p").time()
        else:
            end_clock = datetime.time(23, 59, 59)

        # combine() only uses the date part of its first argument.
        return datetime.datetime.combine(end_day, end_clock)

    @classmethod
    def get_end_time(cls, event_url):
        """Return the end-time text from an event's detail page, or ``None``.

        The page's time paragraph has its periods stripped and is split on
        ``" - "``; the second piece is returned. ``None`` is returned when the
        paragraph is missing or empty, mentions "all day", or holds no range.
        """
        topper_html = cls._fetch_inner_html(event_url, (By.CLASS_NAME, "event__topper-content"))
        time_elem = BeautifulSoup(topper_html, "html.parser").find(
            "p", class_="event__meta event__time"
        )
        if time_elem is None:
            return None

        time_range = time_elem.text.strip().replace(".", "")
        if not time_range or ALL_DAY in time_range.lower():
            return None  # No end time if the event is all day

        times = time_range.split(" - ")
        if len(times) <= 1:
            return None
        return times[1]
migrations.RemoveField(model_name="event", name="facebook",), + migrations.AlterField( + model_name="event", name="description", field=models.TextField(blank=True, null=True), + ), + migrations.AlterField( + model_name="event", + name="email", + field=models.CharField(blank=True, max_length=255, null=True), + ), + migrations.AlterField( + model_name="event", name="end", field=models.DateTimeField(blank=True, null=True), + ), + migrations.AlterField( + model_name="event", + name="event_type", + field=models.CharField(blank=True, max_length=255, null=True), + ), + migrations.AlterField( + model_name="event", name="image_url", field=models.URLField(blank=True, null=True), + ), + migrations.AlterField( + model_name="event", + name="location", + field=models.CharField(blank=True, max_length=255, null=True), + ), + migrations.AlterField( + model_name="event", + name="website", + field=models.URLField(blank=True, max_length=255, null=True), + ), + ] diff --git a/backend/penndata/models.py b/backend/penndata/models.py index 76709063..9279fb52 100644 --- a/backend/penndata/models.py +++ b/backend/penndata/models.py @@ -9,15 +9,15 @@ class Event(models.Model): - event_type = models.CharField(max_length=255) + event_type = models.CharField(max_length=255, null=True, blank=True) name = models.CharField(max_length=255) - description = models.TextField() - image_url = models.URLField() - start_time = models.DateTimeField() - end_time = models.DateTimeField() - email = models.CharField(max_length=255) - website = models.URLField(max_length=255, null=True) - facebook = models.URLField(max_length=255, null=True) + description = models.TextField(null=True, blank=True) + image_url = models.URLField(null=True, blank=True) + start = models.DateTimeField() + end = models.DateTimeField(null=True, blank=True) + location = models.CharField(max_length=255, null=True, blank=True) + email = models.CharField(max_length=255, null=True, blank=True) + website = models.URLField(max_length=255, 
null=True, blank=True) class HomePageOrder(models.Model): diff --git a/backend/penndata/serializers.py b/backend/penndata/serializers.py index 3bd15189..e2163eed 100644 --- a/backend/penndata/serializers.py +++ b/backend/penndata/serializers.py @@ -17,12 +17,12 @@ class Meta: "event_type", "name", "description", + "location", "image_url", - "start_time", - "end_time", + "start", + "end", "email", "website", - "facebook", ) diff --git a/backend/penndata/urls.py b/backend/penndata/urls.py index 4f25f012..185567fd 100644 --- a/backend/penndata/urls.py +++ b/backend/penndata/urls.py @@ -18,7 +18,8 @@ path("news/", News.as_view(), name="news"), path("calendar/", Calendar.as_view(), name="calendar"), path("homepage", HomePage.as_view(), name="homepage"), - path("events//", Events.as_view(), name="events"), + path("events/", Events.as_view(), name="events"), + path("events//", Events.as_view(), name="events-type"), path("order/", HomePageOrdering.as_view(), name="home-page-order"), path("fitness/rooms/", FitnessRoomView.as_view(), name="fitness"), path("fitness/usage//", FitnessUsage.as_view(), name="fitness-usage"), diff --git a/backend/penndata/views.py b/backend/penndata/views.py index 4dad8d06..dba81c88 100644 --- a/backend/penndata/views.py +++ b/backend/penndata/views.py @@ -104,7 +104,16 @@ class Events(generics.ListAPIView): serializer_class = EventSerializer def get_queryset(self): - return Event.objects.filter(event_type=self.kwargs.get("type", "")) + queryset = Event.objects.all() + + event_type = self.kwargs.get("type") + if event_type: + queryset = queryset.filter(event_type=event_type) + + queryset = queryset.filter( + end__gte=timezone.localtime(), start__lte=timezone.localtime() + timedelta(days=30) + ) + return queryset class Analytics(generics.CreateAPIView): diff --git a/backend/tests/penndata/test_views.py b/backend/tests/penndata/test_views.py index 572bfd2e..60b0bf83 100644 --- a/backend/tests/penndata/test_views.py +++ 
b/backend/tests/penndata/test_views.py @@ -56,35 +56,42 @@ def test_response(self): class TestEvent(TestCase): def setUp(self): self.client = APIClient() - Event.objects.create( - event_type="type", - name="test1", - description="asdf", - image_url="https://pennlabs.org/", - start_time=timezone.localtime(), - end_time=timezone.localtime(), - email="a", + self.event1 = Event.objects.create( + event_type="type1", + name="Event 1", + description="Description 1", + start="2024-02-14T10:00:00Z", + end="2099-02-14T12:00:00Z", + location="Location 1", website="https://pennlabs.org/", - facebook="https://pennlabs.org/", ) - Event.objects.create( - event_type="type", - name="test2", - description="asdaf", - image_url="https://pennlabs.org/", - start_time=timezone.localtime(), - end_time=timezone.localtime(), - email="a", + self.event2 = Event.objects.create( + event_type="type2", + name="Event 2", + description="Description 2", + start="2024-02-15T10:00:00Z", + end="2099-02-15T12:00:00Z", + location="Location 2", website="https://pennlabs.org/", - facebook="https://pennlabs.org/", ) - def test_response(self): - response = self.client.get(reverse("events", args=["type"])) + def test_get_all_events(self): + """Test GET request to retrieve all events (no type)""" + url = reverse("events") + response = self.client.get(url) + events = Event.objects.all() + res_json = json.loads(response.content) + self.assertEqual(len(events), len(res_json)) + + def test_get_events_by_type(self): + """Test GET request to retrieve events by type""" + url = reverse("events-type", kwargs={"type": "type1"}) + response = self.client.get(url) + events = Event.objects.filter(event_type="type1") res_json = json.loads(response.content) - self.assertEquals(2, len(res_json)) - self.assertEquals(res_json[0]["name"], "test1") - self.assertEquals(res_json[1]["name"], "test2") + self.assertEqual(len(events), len(res_json)) + event = res_json[0] + self.assertEqual("Event 1", event["name"]) class 
TestHomePage(TestCase): diff --git a/k8s/main.ts b/k8s/main.ts index a75f3137..9038cb43 100644 --- a/k8s/main.ts +++ b/k8s/main.ts @@ -121,6 +121,14 @@ export class MyChart extends PennLabsChart { cmd: ["python", "manage.py", "get_calendar"], env: [{ name: "DJANGO_SETTINGS_MODULE", value: "pennmobile.settings.production" }] }); + + new CronJob(this, 'get-penn-today-events', { + schedule:'0 15 * * *', // Every day at 3 PM + image: backendImage, + secret, + cmd: ["python", "manage.py", "get_penn_today_events"], + env: [{ name: "DJANGO_SETTINGS_MODULE", value: "pennmobile.settings.production" }] + }); } }