Skip to content

Commit

Permalink
Export de données : Renommer table data warehouse et nettoyer les col…
Browse files Browse the repository at this point in the history
…onnes dupliquées (#4631)

Co-authored-by: Quentin Loridant <[email protected]>
  • Loading branch information
qloridant and Quentin Loridant authored Nov 12, 2024
1 parent d66e46c commit cf524a2
Showing 1 changed file with 11 additions and 4 deletions.
15 changes: 11 additions & 4 deletions macantine/etl/analysis.py
Original file line number Diff line number Diff line change
Expand Up @@ -192,6 +192,11 @@ def load_dataset(self):
"""
self.warehouse.insert_dataframe(self.df, self.extracted_table_name)

# Final clean-up of self.df: de-duplicate columns, apply the schema filter,
# then de-duplicate rows on "id". The result is stored back on self.df;
# nothing is returned.
# NOTE(review): assumes self.df is a pandas DataFrame at this point - confirm.
def _clean_dataset(self):
# Keep only the columns whose label is not flagged by columns.duplicated()
# (with pandas defaults, the first occurrence of each label is kept).
self.df = self.df.loc[:, ~self.df.columns.duplicated()]
# Helper is defined outside this view; presumably it restricts the frame to
# the columns declared in self.schema - verify against utils.
self.df = utils.filter_dataframe_with_schema_cols(self.df, self.schema)
# Drop rows that repeat an "id" value (pandas default keeps the first one).
self.df = self.df.drop_duplicates(subset=["id"])


class ETL_ANALYSIS_TD(ETL_ANALYSIS):
"""
Expand All @@ -204,7 +209,7 @@ class ETL_ANALYSIS_TD(ETL_ANALYSIS):
def __init__(self):
self.df = None
self.years = utils.CAMPAIGN_DATES.keys()
self.extracted_table_name = "teledeclarations_extracted"
self.extracted_table_name = "teledeclarations"
self.warehouse = DataWareHouse()
self.schema = json.load(open("data/schemas/schema_analysis.json"))

Expand Down Expand Up @@ -302,7 +307,7 @@ class ETL_ANALYSIS_CANTEEN(ETL_ANALYSIS):

def __init__(self):
self.df = None
self.extracted_table_name = "canteens_extracted"
self.extracted_table_name = "canteens"
self.warehouse = DataWareHouse()
self.schema = json.load(open("data/schemas/schema_analysis_cantines.json"))
# The following mapper is used for renaming columns and for selecting the columns to extract from db
Expand Down Expand Up @@ -338,7 +343,9 @@ def transform_dataset(self):
# Extract the sector names and categories
logger.info("Canteens : Extract sectors and SPE...")
self.df = utils.extract_sectors(self.df, extract_spe=True, split_category_and_sector=True, only_one_value=True)
self.df = self.df.rename(columns={"categories": "categorie"})

self.df = self.df.rename(columns={"categories": "categorie"})
self.df = self.df.rename(columns=self.columns_mapper)
self.df = utils.filter_dataframe_with_schema_cols(self.df, self.schema)

logger.info("Canteens : Clean dataset")
self._clean_dataset()

0 comments on commit cf524a2

Please sign in to comment.