Skip to content

Commit

Permalink
Ajout de filtres supplémentaires pour être en adéquation avec les TD d…
Browse files Browse the repository at this point in the history
…u rapport
  • Loading branch information
Quentin Loridant committed Nov 12, 2024
1 parent ff88a37 commit d6a3874
Show file tree
Hide file tree
Showing 2 changed files with 45 additions and 3 deletions.
8 changes: 6 additions & 2 deletions macantine/etl/analysis.py
Original file line number Diff line number Diff line change
Expand Up @@ -190,6 +190,7 @@ def load_dataset(self):
"""
Load in database
"""
logger.info(f"Loading {len(self.df)} objects in db")
self.warehouse.insert_dataframe(self.df, self.extracted_table_name)

def _clean_dataset(self):
Expand Down Expand Up @@ -230,6 +231,9 @@ def transform_dataset(self):
# Aggregate columns for complete TD - Must occur before other transformations
self.df = aggregate(self.df)

# Add additional filters (the ones that could not be applied at the queryset level)
self.df = utils.filter_teledeclarations(self.df)

self.compute_miscellaneous_columns()

# Convert types
Expand All @@ -243,14 +247,14 @@ def transform_dataset(self):
self.fill_geo_names(prefix="canteen.")

# Fill campaign participation
logger.info("Canteens : Fill campaign participations...")
logger.info("TD : Fill campaign participations...")
for year in utils.CAMPAIGN_DATES.keys():
campaign_participation = utils.map_canteens_td(year)
col_name_campaign = f"declaration_{year}"
self.df[col_name_campaign] = self.df["id"].apply(lambda x: x in campaign_participation)

# Extract the sector names and categories
logger.info("Canteens : Extract sectors...")
logger.info("TD : Extract sectors...")
self.df[["secteur", "catégorie"]] = self.df.apply(
lambda x: utils.format_td_sector_column(x, "canteen.sectors"), axis=1, result_type="expand"
)
Expand Down
40 changes: 39 additions & 1 deletion macantine/etl/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -225,6 +225,33 @@ def map_sectors():
return sectors_mapper


def filter_empty_values(df: pd.DataFrame, col_name) -> pd.DataFrame:
    """
    Filter out the teledeclarations for which a given field is empty.

    :param df: dataframe holding one teledeclaration per row
    :param col_name: a single column label, or a sequence of column labels;
        a row is dropped as soon as one of these columns holds a missing value
    :return: a new dataframe without the filtered rows (``df`` is left untouched)
    :raises KeyError: if a requested column is not present in ``df``
    """
    # A list-like ``subset`` is accepted by every pandas release, whereas a
    # bare string label is only supported by recent ones: wrap it to be safe.
    subset = [col_name] if isinstance(col_name, str) else col_name
    return df.dropna(subset=subset)


def filter_aberrant_td(
    df: pd.DataFrame, *, max_total_ht: float = 1_000_000, max_avg_meal_cost: float = 20
) -> pd.DataFrame:
    """
    Filter out the aberrant teledeclarations, i.e. the ones that combine:
    * a total purchase value ("teledeclaration.value_total_ht") > ``max_total_ht`` €
    AND
    * an average meal cost (total purchase value / "canteen.yearly_meal_count")
      > ``max_avg_meal_cost`` €

    Both conditions must hold for a row to be dropped.

    :param df: dataframe with the "teledeclaration.value_total_ht" and
        "canteen.yearly_meal_count" columns
    :param max_total_ht: total purchase value above which a teledeclaration is
        suspicious (defaults to 1 million €)
    :param max_avg_meal_cost: average meal cost above which a teledeclaration is
        suspicious (defaults to 20 €)
    :return: a new dataframe without the aberrant rows (``df`` is left untouched)
    """
    total_ht = df["teledeclaration.value_total_ht"]
    avg_meal_cost = total_ht / df["canteen.yearly_meal_count"]

    is_aberrant = (total_ht > max_total_ht) & (avg_meal_cost > max_avg_meal_cost)
    return df[~is_aberrant]


def filter_teledeclarations(df: pd.DataFrame):
    """
    Apply every dataframe-level filter to the teledeclarations:
    * drop the rows where the total or the bio purchase value is empty
    * then drop the aberrant teledeclarations (see filter_aberrant_td)
    """
    required_columns = (
        "teledeclaration.value_total_ht",
        "teledeclaration.value_bio_ht",
    )
    for column in required_columns:
        df = filter_empty_values(df, col_name=column)
    return filter_aberrant_td(df)


def fetch_teledeclarations(years: list) -> pd.DataFrame:
df = pd.DataFrame()
for year in years:
Expand All @@ -238,7 +265,18 @@ def fetch_teledeclarations(years: list) -> pd.DataFrame:
),
status=Teledeclaration.TeledeclarationStatus.SUBMITTED,
canteen_id__isnull=False,
).values()
canteen__siret__isnull=False,
canteen__siret__length_gt=14,
diagnostic__value_total_ht__isnull=False,
diagnostic__value_bio_ht__isnull=False,
)
.exclude(
canteen__deletion_date__range=(
CAMPAIGN_DATES[year]["start_date"],
CAMPAIGN_DATES[year]["end_date"],
),
)
.values()
)
df = pd.concat([df, df_year])
else:
Expand Down

0 comments on commit d6a3874

Please sign in to comment.