Interpolate and group ais #8

Open
wants to merge 3 commits into base: master
1 change: 1 addition & 0 deletions setup.cfg
@@ -38,6 +38,7 @@ install_requires =
onc
sqlalchemy
spans
wget
include_package_data = True
zip_safe = False

2 changes: 1 addition & 1 deletion tehom/_persistence.py
@@ -118,7 +118,7 @@ def _ais_ships_columns():


def init_onc_db(onc_db: Union[Path, str]) -> None:
"""Initializes the local AIS record database, if it does not exist"""
"""Initializes the local ONC record database, if it does not exist"""
    eng = _get_engine(onc_db)
    md = MetaData(eng)
    spans_table = Table("spans", md, *_onc_spans_columns())  # noqa: F841
62 changes: 55 additions & 7 deletions tehom/downloads.py
@@ -19,6 +19,7 @@
from pathlib import Path
from functools import lru_cache

import wget
import pandas as pd
import numpy as np
import spans
@@ -32,6 +33,9 @@

from . import _persistence

# from zipfile import ZipFile


ais_site = "https://coast.noaa.gov/htdata/CMSP/AISDataHandler/"
onc = ONC(
_persistence.load_user_token(),
@@ -51,6 +55,8 @@ def download_ships(year: int, month: int, zone: int) -> None:
month (int): month to download
zone (int): UTM zone to download
"""
    _persistence._init_data_folder()
    _persistence._init_ais_db(_persistence.AIS_DB)
    _persistence.init_data_folder()
    _persistence.init_ais_db(_persistence.AIS_DB)
    if (year, month, zone) not in _get_ais_downloads(_persistence.AIS_DB):
@@ -80,8 +86,13 @@ def _download_ais_to_temp(year: int, month: int, zone: int) -> Path:
Returns:
location of download result
"""
    # morgan
    pass
    # JMSH: morgan. MWM, 07/23/2021: Done.
    url = (
        f"https://coast.noaa.gov/htdata/CMSP/AISDataHandler/{year}/"
        f"AIS_{year}_{month}_{zone}.zip"
    )
    wget.download(url, _persistence.AIS_TEMP_DIR)
    return _persistence.AIS_TEMP_DIR
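
A possible follow-up for review (not part of this diff): the wget package's download() returns the path of the file it actually wrote, so capturing that value would let the helper return the archive itself rather than the temp directory. A minimal sketch, reusing url and AIS_TEMP_DIR from above:

    # sketch only: wget.download returns the name of the file it wrote
    downloaded = wget.download(url, out=str(_persistence.AIS_TEMP_DIR))
    return Path(downloaded)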


def _unzip_ais(zipfile: Path) -> Tuple[Path]:
@@ -94,7 +105,7 @@
tuple comprising the root of the unzip tree and the specific
unzipped file of interest
"""
# morgan
# JMSH: morgan.
pass
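
One way this stub could eventually be filled in, sketched with the standard-library zipfile (the commented-out ZipFile import near the top of the file points that way). The helper name, the extraction directory, and the assumption that a single .csv member is the file of interest are illustrative, not part of the PR:

from pathlib import Path
from typing import Tuple
from zipfile import ZipFile

def _unzip_ais_sketch(zip_path: Path) -> Tuple[Path, Path]:
    """Extract zip_path next to itself and return (unzip root, CSV of interest)."""
    root = zip_path.parent / zip_path.stem
    with ZipFile(zip_path) as zf:
        zf.extractall(root)
        members = zf.namelist()
    # assumption: the monthly AIS archive contains one CSV worth keeping
    csv_name = next(m for m in members if m.lower().endswith(".csv"))
    return root, root / csv_name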


@@ -525,32 +536,69 @@ def _interpolate_and_group_ais(
"""Interpolate the lat/lon of ships to the specified time.

Interpolation rules:
A ship has observations near the specified time, before and
C1: A ship has observations near the specified time, before and
after: linear interpolation
A ship has one observation very near the specified time, either
C2: A ship has one observation very near the specified time, either
before or after, but not both: constant interpolation
A ship does not meet above criteria: do not create an
C3: A ship does not meet above criteria: do not create an
interpolated record for this ship at this time

Note:
What counts as "near" and "very near" is subject to change and
may be refactored out into an interpolation parameters object

MWM: "near" = entries before and after the given time,
"very near" = one entry within a neighborhood of the given time

Comment on lines +550 to +552

Not sure what the point of this is. These aren't definitions, despite the equals sign. These are cases when each term is used.

Arguments:
ais_df: ship records, including a basedatetime column.
times: when to interpolate the ship positions.

Returns:
The interpolated records, grouped by time.
"""
    # Morgan, you'll have to first groupby mmsi (ship unique id) and
    # 1.) groupby mmsi (ship unique id)
    # then apply an interpolation function for each timepoint. The
    # interpolation function will take a dataframe and a timepoint,
    # and will determine, based on the nearest records before/after
    # the timepoint, which interpolation rule to apply.
    #
    # While this function sounds like it takes a long time, it's ok at
    # the outset to accomplish this somewhat inefficiently.
    mmsi_set = ais_df.groupby["mmsi"]

groupby is a function, needs parentheses (e.g. groupby("mmsi"))

Also, be careful naming things *_set when they're not a set object. Try mmsi_gb since this is a groupby object.
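
A minimal sketch of the suggested pattern (the name mmsi_gb and the loop body are illustrative only):

mmsi_gb = ais_df.groupby("mmsi")
for mmsi, group in mmsi_gb:
    # group holds the records for one ship; interpolate per ship here
    ...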

    for mmsi in mmsi_set.groups():
        mmsi
        # group = mmsi_set.get_group(mmsi)
        # intrp = mmsi_set.apply(_ais_interpolator_dispatcher)
        pass


def _ais_interpolator_dispatcher(mmsi: pd.DataFrame, time, delta):
"""Assess which case to apply from interpolation rules of
_interpolate_and_group_ais() docstring and dispatch to helper functions.

Arguments:
group_df: single-mmsi subset of ship records, with basedatetime column.
time: when to interpolate the ship positions.
delta: the "very near" threshold
Comment on lines +582 to +583

You need to do this for a list of times, so either _interpolate_and_group_ais() needs to apply this to each time, or _ais_interpolator_dispatcher() needs to take a list of times.

Also, you need deltas for the "near" and "very near" thresholds.

"""
    time_col = mmsi["BaseDateTime"]

Try something like this, but faster:

for target_time in times:
    last_before = max(
        (obs_time for obs_time in time_col if obs_time < target_time), default=None
    )
    first_after = min(
        (obs_time for obs_time in time_col if obs_time > target_time), default=None
    )
    if (
        last_before
        and target_time - last_before < near
        and first_after
        and first_after - target_time < near
    ):
        pass  # C1: linear interpolation
    elif last_before and target_time - last_before < very_near:
        pass  # C2: constant interpolation from the earlier record
    elif first_after and first_after - target_time < very_near:
        pass  # C2: constant interpolation from the later record

    if all(time_col - time < 0):  # C1: time after whole record
        pass
    elif all(time_col - time > 0):  # C1: time before whole record
        pass
    else:  # C2 or C3
        c2 = (time_col.rsub(time) >= -delta) & (time_col.rsub(time) <= delta)
        if c2.sum() > 0:  # c2.sum() > 1 not impossible for arbitrary delta
            # constant interpolation
            mmsi[c2]
            pass
        else:
            # linear interpolation
            # TODO: this assumes chronological ordering of masked subsets
            # before = mmsi[time_col - time > 0].iloc[-1]
            # after = mmsi[time_col - time < 0].iloc[0]
            pass
    pass
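
Pulling the review comments together, a rough end-to-end sketch of the dispatch logic that takes a list of times plus explicit near/very_near thresholds (assumed to be pandas Timedeltas). The LAT/LON column names, the helper name, and the returned structure are assumptions for illustration, not part of the PR:

import pandas as pd

def _dispatch_sketch(group: pd.DataFrame, times, near, very_near):
    """Return {time: (lat, lon)} for one ship, or None where rule C3 applies."""
    obs = group.sort_values("BaseDateTime")
    results = {}
    for t in times:
        before = obs[obs["BaseDateTime"] <= t]
        after = obs[obs["BaseDateTime"] > t]
        if (
            not before.empty
            and not after.empty
            and t - before["BaseDateTime"].iloc[-1] <= near
            and after["BaseDateTime"].iloc[0] - t <= near
        ):
            # C1: linear interpolation between the bracketing records
            b, a = before.iloc[-1], after.iloc[0]
            frac = (t - b["BaseDateTime"]) / (a["BaseDateTime"] - b["BaseDateTime"])
            results[t] = (
                b["LAT"] + frac * (a["LAT"] - b["LAT"]),
                b["LON"] + frac * (a["LON"] - b["LON"]),
            )
        elif not before.empty and t - before["BaseDateTime"].iloc[-1] <= very_near:
            # C2: constant interpolation from the nearest earlier record
            results[t] = (before["LAT"].iloc[-1], before["LON"].iloc[-1])
        elif not after.empty and after["BaseDateTime"].iloc[0] - t <= very_near:
            # C2: constant interpolation from the nearest later record
            results[t] = (after["LAT"].iloc[0], after["LON"].iloc[0])
        else:
            # C3: too far from any observation; no record for this time
            results[t] = None
    return results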


10 changes: 10 additions & 0 deletions tehom/tests/test_downloads.py
@@ -0,0 +1,10 @@
from tehom import downloads, _persistence


def test_download_ais_to_temp(declare_stateful):
    year = 2014
    month = 1
    zone = 1
    downloads._download_ais_to_temp(year, month, zone)
    path = _persistence.AIS_TEMP_DIR / f"{year}_{month}_{zone}.zip"
    assert path.exists()
Comment on lines +1 to +10

Wrong branch