Skip to content

Commit

Permalink
Merge pull request #2704 from tkalir/2703-refactor-extract-geo-features
Browse files Browse the repository at this point in the history
refactoring extract_geo_features
  • Loading branch information
tkalir authored Sep 17, 2024
2 parents 67a5c9b + 46b0b81 commit 0e9a621
Show file tree
Hide file tree
Showing 2 changed files with 37 additions and 28 deletions.
59 changes: 34 additions & 25 deletions anyway/parsers/location_extraction.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,7 +15,6 @@
from sqlalchemy.orm import load_only
from datetime import date


INTEGER_FIELDS = ["road1", "road2", "road_segment_id", "yishuv_symbol", "street1", "street2"]


Expand Down Expand Up @@ -516,31 +515,41 @@ def extract_location_text(text):
return text


def extract_geo_features(db, newsflash: NewsFlash, update_cbs_location_only: bool) -> None:
location_from_db = None
if update_cbs_location_only:
if newsflash.resolution in ["כביש בינעירוני", "רחוב"]:
location_from_db = get_db_matching_location(
db, newsflash.lat, newsflash.lon, newsflash.resolution, newsflash.road1
)
else:
newsflash.location = extract_location_text(newsflash.description) or extract_location_text(
newsflash.title
def update_coordinates_and_resolution_using_location_text(newsflash):
    """Geocode the newsflash's location text and store the result on it.

    The location text is taken from the description, falling back to the
    title when the description yields a falsy value, and is always written
    to ``newsflash.location``. When geocoding returns a result, ``lat``,
    ``lon``, ``road1`` and ``resolution`` are overwritten from it; when it
    returns ``None`` those four attributes are left as they were.

    Returns:
        ``True`` if ``geocode_extract`` returned a result, ``False`` if it
        returned ``None``.
    """
    location_text = extract_location_text(newsflash.description)
    if not location_text:
        location_text = extract_location_text(newsflash.title)
    newsflash.location = location_text

    geocoded = geocode_extract(newsflash.location)
    if geocoded is None:
        return False

    coordinates = geocoded["geom"]
    newsflash.lat = coordinates["lat"]
    newsflash.lon = coordinates["lng"]
    newsflash.road1 = geocoded["road_no"]
    newsflash.resolution = set_accident_resolution(geocoded)
    return True


def extract_geo_features(db, newsflash: NewsFlash, use_existing_coordinates_only: bool) -> None:
if not use_existing_coordinates_only:
update_coordinates_and_resolution_using_location_text(newsflash)

if newsflash.resolution in BE_CONST.SUPPORTED_RESOLUTIONS:
location_from_db = get_db_matching_location(
db, newsflash.lat, newsflash.lon, newsflash.resolution, newsflash.road1
)
geo_location = geocode_extract(newsflash.location)
if geo_location is not None:
newsflash.lat = geo_location["geom"]["lat"]
newsflash.lon = geo_location["geom"]["lng"]
newsflash.resolution = set_accident_resolution(geo_location)
location_from_db = get_db_matching_location(
db, newsflash.lat, newsflash.lon, newsflash.resolution, geo_location["road_no"]
)
if location_from_db is not None:
for k, v in location_from_db.items():
setattr(newsflash, k, v)
for field in RF.get_all_location_fields():
if field not in location_from_db:
setattr(newsflash, field, None)
if location_from_db is not None:
update_location_fields(newsflash, location_from_db)
try_find_segment_id(newsflash)


def update_location_fields(newsflash, location_from_db):
    """Copy a matched DB location onto the newsflash.

    Each key/value pair of ``location_from_db`` is set as an attribute on
    ``newsflash``. Every field returned by ``RF.get_all_location_fields()``
    that is not a key of ``location_from_db`` is then reset to ``None``.
    """
    for attr_name, attr_value in location_from_db.items():
        setattr(newsflash, attr_name, attr_value)

    # NOTE(review): the reset pass is assumed to run after (not inside) the
    # copy loop above - confirm against the repository version.
    absent_fields = [
        name for name in RF.get_all_location_fields() if name not in location_from_db
    ]
    for name in absent_fields:
        setattr(newsflash, name, None)


def try_find_segment_id(newsflash):
if (
newsflash.road_segment_id is None
and newsflash.road_segment_name is not None
Expand Down
6 changes: 3 additions & 3 deletions anyway/parsers/news_flash.py
Original file line number Diff line number Diff line change
Expand Up @@ -36,7 +36,7 @@ def update_all_in_db(source=None, newsflash_id=None, update_cbs_location_only=Fa
newsflash.accident = classify(newsflash.title)
if newsflash.accident:
extract_geo_features(
db=db, newsflash=newsflash, update_cbs_location_only=update_cbs_location_only
db=db, newsflash=newsflash, use_existing_coordinates_only=update_cbs_location_only
)
if i % 1000 == 0:
db.commit()
Expand All @@ -53,7 +53,7 @@ def scrape_extract_store_rss(site_name, db):
newsflash.organization = classify_organization(site_name)
if newsflash.accident:
# FIX: No accident-accurate date extracted
extract_geo_features(db=db, newsflash=newsflash, update_cbs_location_only=False)
extract_geo_features(db=db, newsflash=newsflash, use_existing_coordinates_only=False)
db.insert_new_newsflash(newsflash)


Expand All @@ -66,7 +66,7 @@ def scrape_extract_store_twitter(screen_name, db):
newsflash.accident = classify_tweets(newsflash.description)
newsflash.organization = classify_organization("twitter")
if newsflash.accident:
extract_geo_features(db=db, newsflash=newsflash, update_cbs_location_only=False)
extract_geo_features(db=db, newsflash=newsflash, use_existing_coordinates_only=False)
db.insert_new_newsflash(newsflash)


Expand Down

0 comments on commit 0e9a621

Please sign in to comment.