From d870a7b09c337adcedf3b6d2006be379da42563f Mon Sep 17 00:00:00 2001 From: Atalya Alon Date: Sun, 28 Jan 2024 22:17:41 +0200 Subject: [PATCH 1/8] add option to update_cbs_location_only --- anyway/parsers/__init__.py | 2 +- anyway/parsers/location_extraction.py | 25 ++++++++++++++++--------- anyway/parsers/news_flash.py | 16 ++++++++++------ main.py | 5 +++-- 4 files changed, 30 insertions(+), 18 deletions(-) diff --git a/anyway/parsers/__init__.py b/anyway/parsers/__init__.py index c51b66349..da148c4e3 100644 --- a/anyway/parsers/__init__.py +++ b/anyway/parsers/__init__.py @@ -4,7 +4,7 @@ "עיר": ["yishuv_name"], "רחוב": ["yishuv_name", "street1_hebrew"], "צומת עירוני": ["yishuv_name", "street1_hebrew", "street2_hebrew"], - "כביש בינעירוני": ["road1", "road_segment_name"], + "כביש בינעירוני": ["road1", "road_segment_name", "road_segment_id"], "צומת בינעירוני": ["non_urban_intersection", "non_urban_intersection_hebrew", "road1", "road2"], "אחר": [ "region_hebrew", diff --git a/anyway/parsers/location_extraction.py b/anyway/parsers/location_extraction.py index 809ee37e9..d69bb1ec9 100644 --- a/anyway/parsers/location_extraction.py +++ b/anyway/parsers/location_extraction.py @@ -506,18 +506,25 @@ def extract_location_text(text): return text -def extract_geo_features(db, newsflash: NewsFlash) -> None: - newsflash.location = extract_location_text(newsflash.description) or extract_location_text( - newsflash.title - ) - geo_location = geocode_extract(newsflash.location) - if geo_location is not None: - newsflash.lat = geo_location["geom"]["lat"] - newsflash.lon = geo_location["geom"]["lng"] - newsflash.resolution = set_accident_resolution(geo_location) +def extract_geo_features(db, newsflash: NewsFlash, update_cbs_location_only: bool) -> None: + location_from_db = None + if update_cbs_location_only: location_from_db = get_db_matching_location( + db, newsflash.lat, newsflash.lon, newsflash.resolution, geo_location["road_no"] + ) + else: + newsflash.location = 
extract_location_text(newsflash.description) or extract_location_text( + newsflash.title + ) + geo_location = geocode_extract(newsflash.location) + if geo_location is not None: + newsflash.lat = geo_location["geom"]["lat"] + newsflash.lon = geo_location["geom"]["lng"] + newsflash.resolution = set_accident_resolution(geo_location) + location_from_db = get_db_matching_location( db, newsflash.lat, newsflash.lon, newsflash.resolution, geo_location["road_no"] ) + if location_from_db is not None: for k, v in location_from_db.items(): setattr(newsflash, k, v) all_resolutions = [] diff --git a/anyway/parsers/news_flash.py b/anyway/parsers/news_flash.py index b05a39a1e..070138b73 100644 --- a/anyway/parsers/news_flash.py +++ b/anyway/parsers/news_flash.py @@ -14,7 +14,7 @@ news_flash_classifiers = {"ynet": classify_rss, "twitter": classify_tweets, "walla": classify_rss} -def update_all_in_db(source=None, newsflash_id=None): +def update_all_in_db(source=None, newsflash_id=None, update_cbs_location_only=False): """ main function for newsflash updating. 
@@ -29,11 +29,15 @@ def update_all_in_db(source=None, newsflash_id=None): newsflash_items = db.get_all_newsflash() for newsflash in newsflash_items: - classify = news_flash_classifiers[newsflash.source] - newsflash.organization = classify_organization(newsflash.source) - newsflash.accident = classify(newsflash.description or newsflash.title) - if newsflash.accident: - extract_geo_features(db, newsflash) + if update_cbs_location_only: + if newsflash.accident: + extract_geo_features(db, newsflash, update_cbs_location_only=True) + else: + classify = news_flash_classifiers[newsflash.source] + newsflash.organization = classify_organization(newsflash.source) + newsflash.accident = classify(newsflash.description or newsflash.title) + if newsflash.accident: + extract_geo_features(db=db, newsflash=newsflash, update_cbs_location_only=False) db.commit() diff --git a/main.py b/main.py index 58d66150b..e29df0620 100755 --- a/main.py +++ b/main.py @@ -67,14 +67,15 @@ def update_news_flash(): @update_news_flash.command() @click.option("--source", default="", type=str) @click.option("--news_flash_id", default="", type=str) -def update(source, news_flash_id): +@click.option("--update_cbs_location_only", is_flag=True) +def update(source, news_flash_id, update_cbs_location_only): from anyway.parsers import news_flash if not source: source = None if not news_flash_id: news_flash_id = None - return news_flash.update_all_in_db(source, news_flash_id) + return news_flash.update_all_in_db(source, news_flash_id, update_cbs_location_only) @update_news_flash.command() From 960c29917bece2eed653ca514edc81df10796401 Mon Sep 17 00:00:00 2001 From: Atalya Alon Date: Sun, 28 Jan 2024 22:22:34 +0200 Subject: [PATCH 2/8] black --- anyway/parsers/location_extraction.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/anyway/parsers/location_extraction.py b/anyway/parsers/location_extraction.py index d69bb1ec9..632dbc7fd 100644 --- a/anyway/parsers/location_extraction.py +++ 
b/anyway/parsers/location_extraction.py @@ -510,8 +510,8 @@ def extract_geo_features(db, newsflash: NewsFlash, update_cbs_location_only: boo location_from_db = None if update_cbs_location_only: location_from_db = get_db_matching_location( - db, newsflash.lat, newsflash.lon, newsflash.resolution, geo_location["road_no"] - ) + db, newsflash.lat, newsflash.lon, newsflash.resolution, geo_location["road_no"] + ) else: newsflash.location = extract_location_text(newsflash.description) or extract_location_text( newsflash.title @@ -522,8 +522,8 @@ def extract_geo_features(db, newsflash: NewsFlash, update_cbs_location_only: boo newsflash.lon = geo_location["geom"]["lng"] newsflash.resolution = set_accident_resolution(geo_location) location_from_db = get_db_matching_location( - db, newsflash.lat, newsflash.lon, newsflash.resolution, geo_location["road_no"] - ) + db, newsflash.lat, newsflash.lon, newsflash.resolution, geo_location["road_no"] + ) if location_from_db is not None: for k, v in location_from_db.items(): setattr(newsflash, k, v) From e58307e5499c9b50a5c8130c5c4c47cfa171dadc Mon Sep 17 00:00:00 2001 From: Atalya Alon Date: Sun, 28 Jan 2024 22:28:13 +0200 Subject: [PATCH 3/8] pylint fix --- anyway/parsers/location_extraction.py | 2 +- anyway/parsers/news_flash.py | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/anyway/parsers/location_extraction.py b/anyway/parsers/location_extraction.py index 632dbc7fd..c837de3b2 100644 --- a/anyway/parsers/location_extraction.py +++ b/anyway/parsers/location_extraction.py @@ -510,7 +510,7 @@ def extract_geo_features(db, newsflash: NewsFlash, update_cbs_location_only: boo location_from_db = None if update_cbs_location_only: location_from_db = get_db_matching_location( - db, newsflash.lat, newsflash.lon, newsflash.resolution, geo_location["road_no"] + db, newsflash.lat, newsflash.lon, newsflash.resolution, newsflash.road1 ) else: newsflash.location = extract_location_text(newsflash.description) or 
extract_location_text( diff --git a/anyway/parsers/news_flash.py b/anyway/parsers/news_flash.py index 070138b73..6b7c85bc6 100644 --- a/anyway/parsers/news_flash.py +++ b/anyway/parsers/news_flash.py @@ -51,7 +51,7 @@ def scrape_extract_store_rss(site_name, db): newsflash.organization = classify_organization(site_name) if newsflash.accident: # FIX: No accident-accurate date extracted - extract_geo_features(db, newsflash) + extract_geo_features(db=db, newsflash=newsflash, update_cbs_location_only=False) newsflash.set_critical() db.insert_new_newsflash(newsflash) @@ -65,7 +65,7 @@ def scrape_extract_store_twitter(screen_name, db): newsflash.accident = classify_tweets(newsflash.description) newsflash.organization = classify_organization("twitter") if newsflash.accident: - extract_geo_features(db, newsflash) + extract_geo_features(db=db, newsflash=newsflash, update_cbs_location_only=False) newsflash.set_critical() db.insert_new_newsflash(newsflash) From f99bfa259ce99e9c2ff33d86fab0281b39e065ff Mon Sep 17 00:00:00 2001 From: Atalya Alon Date: Sun, 28 Jan 2024 22:37:38 +0200 Subject: [PATCH 4/8] remove road_segment_id from resolution dict --- anyway/parsers/__init__.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/anyway/parsers/__init__.py b/anyway/parsers/__init__.py index da148c4e3..c51b66349 100644 --- a/anyway/parsers/__init__.py +++ b/anyway/parsers/__init__.py @@ -4,7 +4,7 @@ "עיר": ["yishuv_name"], "רחוב": ["yishuv_name", "street1_hebrew"], "צומת עירוני": ["yishuv_name", "street1_hebrew", "street2_hebrew"], - "כביש בינעירוני": ["road1", "road_segment_name", "road_segment_id"], + "כביש בינעירוני": ["road1", "road_segment_name"], "צומת בינעירוני": ["non_urban_intersection", "non_urban_intersection_hebrew", "road1", "road2"], "אחר": [ "region_hebrew", From 6a86b38abb2914609c0d94f0884be47e1991c701 Mon Sep 17 00:00:00 2001 From: Atalya Alon <20992625+atalyaalon@users.noreply.github.com> Date: Wed, 14 Feb 2024 17:36:23 +0200 Subject: [PATCH 5/8] 
Update news_flash.py --- anyway/parsers/news_flash.py | 9 +++------ 1 file changed, 3 insertions(+), 6 deletions(-) diff --git a/anyway/parsers/news_flash.py b/anyway/parsers/news_flash.py index 6b7c85bc6..d98260a95 100644 --- a/anyway/parsers/news_flash.py +++ b/anyway/parsers/news_flash.py @@ -29,15 +29,12 @@ def update_all_in_db(source=None, newsflash_id=None, update_cbs_location_only=Fa newsflash_items = db.get_all_newsflash() for newsflash in newsflash_items: - if update_cbs_location_only: - if newsflash.accident: - extract_geo_features(db, newsflash, update_cbs_location_only=True) - else: + if not update_cbs_location_only: classify = news_flash_classifiers[newsflash.source] newsflash.organization = classify_organization(newsflash.source) newsflash.accident = classify(newsflash.description or newsflash.title) - if newsflash.accident: - extract_geo_features(db=db, newsflash=newsflash, update_cbs_location_only=False) + if newsflash.accident: + extract_geo_features(db=db, newsflash=newsflash, update_cbs_location_only) db.commit() From d610694beaf8328847defb8a3b7cddf6fb850f9a Mon Sep 17 00:00:00 2001 From: Atalya Alon <20992625+atalyaalon@users.noreply.github.com> Date: Wed, 14 Feb 2024 17:40:14 +0200 Subject: [PATCH 6/8] Update news_flash.py --- anyway/parsers/news_flash.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/anyway/parsers/news_flash.py b/anyway/parsers/news_flash.py index d98260a95..0807d921a 100644 --- a/anyway/parsers/news_flash.py +++ b/anyway/parsers/news_flash.py @@ -34,7 +34,7 @@ def update_all_in_db(source=None, newsflash_id=None, update_cbs_location_only=Fa newsflash.organization = classify_organization(newsflash.source) newsflash.accident = classify(newsflash.description or newsflash.title) if newsflash.accident: - extract_geo_features(db=db, newsflash=newsflash, update_cbs_location_only) + extract_geo_features(db=db, newsflash=newsflash, update_cbs_location_only=update_cbs_location_only) db.commit() From 
26d662f4a4b05200800a71d3571fcb6a2826d325 Mon Sep 17 00:00:00 2001 From: Atalya Alon Date: Wed, 14 Feb 2024 17:44:12 +0200 Subject: [PATCH 7/8] black fix --- anyway/parsers/news_flash.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/anyway/parsers/news_flash.py b/anyway/parsers/news_flash.py index 0807d921a..15a0334a9 100644 --- a/anyway/parsers/news_flash.py +++ b/anyway/parsers/news_flash.py @@ -34,7 +34,9 @@ def update_all_in_db(source=None, newsflash_id=None, update_cbs_location_only=Fa newsflash.organization = classify_organization(newsflash.source) newsflash.accident = classify(newsflash.description or newsflash.title) if newsflash.accident: - extract_geo_features(db=db, newsflash=newsflash, update_cbs_location_only=update_cbs_location_only) + extract_geo_features( + db=db, newsflash=newsflash, update_cbs_location_only=update_cbs_location_only + ) db.commit() From 89a8d1a12b829adb2d4343db5916828f076e5c53 Mon Sep 17 00:00:00 2001 From: Atalya Alon Date: Wed, 14 Feb 2024 17:48:20 +0200 Subject: [PATCH 8/8] fetch newsflash in desc order --- anyway/parsers/news_flash_db_adapter.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/anyway/parsers/news_flash_db_adapter.py b/anyway/parsers/news_flash_db_adapter.py index a25e6195b..0e6c88ca4 100644 --- a/anyway/parsers/news_flash_db_adapter.py +++ b/anyway/parsers/news_flash_db_adapter.py @@ -3,6 +3,7 @@ import logging import pandas as pd import numpy as np +from sqlalchemy import desc from flask_sqlalchemy import SQLAlchemy from anyway.parsers import infographics_data_cache_updater from anyway.parsers import timezones @@ -111,7 +112,7 @@ def select_newsflash_where_source(self, source): return self.db.session.query(NewsFlash).filter(NewsFlash.source == source) def get_all_newsflash(self): - return self.db.session.query(NewsFlash) + return self.db.session.query(NewsFlash).order_by(desc(NewsFlash.date)) def get_latest_date_of_source(self, source): """