From 80a7c2c988c1a315b2c5b572fd0b6e62fb1645f0 Mon Sep 17 00:00:00 2001
From: Benjamin Cretois
Date: Tue, 19 Mar 2024 09:27:40 +0100
Subject: [PATCH] [ADD] evaluation_all.py for getting mean and std of multiple runs

---
 evaluate/evaluateDCASE.py                     |   2 +-
 evaluate/evaluation_metrics/evaluation_all.py | 298 ++++++++++++++++++
 2 files changed, 299 insertions(+), 1 deletion(-)
 create mode 100644 evaluate/evaluation_metrics/evaluation_all.py

diff --git a/evaluate/evaluateDCASE.py b/evaluate/evaluateDCASE.py
index c2d36fc..c5ec8b0 100644
--- a/evaluate/evaluateDCASE.py
+++ b/evaluate/evaluateDCASE.py
@@ -21,7 +21,7 @@ from datamodules.TestDCASEDataModule import DCASEDataModule, AudioDatasetDCASE
 
 import pytorch_lightning as pl
 
-pl.utilities.seed.seed_everything(0, workers=True)
+#pl.utilities.seed.seed_everything(0, workers=True)
 
 from callbacks.callbacks import MilestonesFinetuning
 
diff --git a/evaluate/evaluation_metrics/evaluation_all.py b/evaluate/evaluation_metrics/evaluation_all.py
new file mode 100644
index 0000000..a71a158
--- /dev/null
+++ b/evaluate/evaluation_metrics/evaluation_all.py
@@ -0,0 +1,298 @@
+import pandas as pd
+import argparse
+import os
+import json
+import numpy as np
+import csv
+import evaluate.evaluation_metrics.metrics as metrics
+from datetime import datetime
+import copy
+from scipy import stats
+import glob
+
+MIN_EVAL_VALUE = 0.00001
+N_SHOTS = 5
+MIN_IOU_TH = 0.3
+PRED_FILE_HEADER = ["Audiofilename","Starttime","Endtime"]
+POS_VALUE = 'POS'
+UNK_VALUE = 'UNK'
+
+def remove_shots_from_ref(ref_df, number_shots=5):
+
+    ref_pos_indexes = select_events_with_value(ref_df, value=POS_VALUE)
+    ref_n_shot_index = ref_pos_indexes[number_shots-1]
+    # remove all events (POS and UNK) that happen before this 5th event
+    events_to_drop = ref_df.index[ref_df['Endtime'] <= ref_df.iloc[ref_n_shot_index]['Endtime']].tolist()
+
+    return ref_df.drop(events_to_drop)
+
+def select_events_with_value(data_frame, value=POS_VALUE):
+
+    indexes_list = data_frame.index[data_frame["Q"] == value].tolist()
+
+    return indexes_list
+
+def build_matrix_from_selected_rows(data_frame, selected_indexes_list):
+
+    matrix_data = np.ones((2, len(selected_indexes_list))) * -1
+    for n, idx in enumerate(selected_indexes_list):
+        matrix_data[0, n] = data_frame.loc[idx].Starttime  # start time for event n
+        matrix_data[1, n] = data_frame.loc[idx].Endtime
+    return matrix_data
+
+
+def compute_tp_fp_fn(pred_events_df, ref_events_df):
+    # inputs: dataframe with predicted events, dataframe with reference events and their value (POS, UNK, NEG)
+    # output: true positive, false positive and false negative counts, plus the total number of POS events in the reference.
+
+    # First pass: bipartite graph matching between predicted events and reference POS events gives the TPs.
+    # Second pass: match the remaining predicted events against the reference UNK events.
+    # FP is the number of predicted events still unmatched after the two rounds of matching.
+    # FN is the number of reference POS events left unmatched.
+
+    ref_pos_indexes = select_events_with_value(ref_events_df, value=POS_VALUE)
+
+    if "Q" not in pred_events_df.columns:
+        pred_events_df["Q"] = POS_VALUE
+
+    #sort events by starttime
+    pred_events_df = pred_events_df.sort_values(by='Starttime', axis=0, ascending=True)
+    pred_pos_indexes = select_events_with_value(pred_events_df, value=POS_VALUE)
+
+    ref_1st_round = build_matrix_from_selected_rows(ref_events_df, ref_pos_indexes)
+    pred_1st_round = build_matrix_from_selected_rows(pred_events_df, pred_pos_indexes)
+
+    m_pos = metrics.match_events(ref_1st_round, pred_1st_round, min_iou=MIN_IOU_TH)
+    matched_ref_indexes = [ri for ri, pi in m_pos]
+    matched_pred_indexes = [pi for ri, pi in m_pos]
+
+    ref_unk_indexes = select_events_with_value(ref_events_df, value=UNK_VALUE)
+    ref_2nd_round = build_matrix_from_selected_rows(ref_events_df, ref_unk_indexes)
+
+    unmatched_pred_events = list(set(range(pred_1st_round.shape[1])) - set(matched_pred_indexes))
+    pred_2nd_round = pred_1st_round[:, unmatched_pred_events]
+
+    m_unk = metrics.match_events(ref_2nd_round, pred_2nd_round, min_iou=MIN_IOU_TH)
+
+    # print("# Positive matches between Ref and Pred :", len(m_pos))
+    # print("# matches with Unknown events: ", len(m_unk))
+
+    tp = len(m_pos)
+    fp = pred_1st_round.shape[1] - tp - len(m_unk)
+
+    ## compute unmatched pos ref events:
+    count_unmached_pos_ref_events = len(ref_pos_indexes) - tp
+
+    fn = count_unmached_pos_ref_events
+
+    total_n_POS_events = len(ref_pos_indexes)
+    return tp, fp, fn, total_n_POS_events
+
+def compute_scores_per_class(counts_per_class):
+
+    scores_per_class = {}
+    for cl in counts_per_class.keys():
+        tp = counts_per_class[cl]["TP"]
+        fp = counts_per_class[cl]["FP"]
+        fn = counts_per_class[cl]["FN"]
+
+        # to compute the harmonic mean we need to have all entries as non zero
+        precision = tp/(tp+fp) if tp+fp != 0 else MIN_EVAL_VALUE  # case where no predictions were made
+        if precision < MIN_EVAL_VALUE:
+            precision = MIN_EVAL_VALUE
+        recall = tp/(fn+tp) if tp != 0 else MIN_EVAL_VALUE
+        fmeasure = tp/(tp+0.5*(fp+fn)) if tp != 0 else MIN_EVAL_VALUE
+
+        scores_per_class[cl] = {"precision": precision, "recall": recall, "f-measure": fmeasure}
+
+    return scores_per_class
+
+def compute_scores_from_counts(counts):
+    tp = counts["TP"]
+    fp = counts["FP"]
+    fn = counts["FN"]
+
+    # to compute the harmonic mean we need to have all entries as non zero
+    precision = tp/(tp+fp) if tp+fp != 0 else MIN_EVAL_VALUE  # case where no predictions were made
+    if precision < MIN_EVAL_VALUE:
+        precision = MIN_EVAL_VALUE
+    recall = tp/(fn+tp) if tp != 0 else MIN_EVAL_VALUE
+    fmeasure = tp/(tp+0.5*(fp+fn)) if tp != 0 else MIN_EVAL_VALUE
+
+    scores = {"precision": precision, "recall": recall, "f-measure": fmeasure}
+
+    return scores
+
+def evaluate(pred_file_path, ref_file_path, team_name, dataset, savepath, metadata=[]):
+
+    print("\nEvaluation for:", team_name, dataset)
+    # read GT file structure: get subsets and paths for ref csvs, make an inverted dictionary with audiofilenames as keys and folder as value
+    gt_file_structure = {}
+    gt_file_structure[dataset] = {}
+    inv_gt_file_structure = {}
+    list_of_subsets = os.listdir(ref_file_path)
+    for subset in list_of_subsets:
+        gt_file_structure[dataset][subset] = [os.path.basename(fl)[0:-4]+'.wav' for fl in glob.glob(os.path.join(ref_file_path, subset, "*.csv"))]
+        for audiofile in gt_file_structure[dataset][subset]:
+            inv_gt_file_structure[audiofile] = subset
+
+    # read prediction csv
+    pred_csv = pd.read_csv(pred_file_path, dtype=str)
+    # verify headers:
+    if list(pred_csv.columns) != PRED_FILE_HEADER:
+        print('Please correct the header of the prediction file. This should be', PRED_FILE_HEADER)
+        exit(1)
+    # parse prediction csv
+    # split file into lists of events for the same audiofile.
+    pred_events_by_audiofile = dict(tuple(pred_csv.groupby('Audiofilename')))
+
+    counts_per_audiofile = {}
+    for audiofilename in list(pred_events_by_audiofile.keys()):
+
+        # for each audiofile, load the corresponding GT file (audiofilename.csv)
+        ref_events_this_audiofile_all = pd.read_csv(os.path.join(ref_file_path, inv_gt_file_structure[audiofilename], audiofilename[0:-4]+'.csv'), dtype={'Starttime': np.float64, 'Endtime': np.float64})
+        # sort events by starttime:
+        ref_events_this_audiofile_all = ref_events_this_audiofile_all.sort_values(by='Starttime', axis=0, ascending=True)
+
+        # Remove the 5 shots from GT:
+        ref_events_this_audiofile = remove_shots_from_ref(ref_events_this_audiofile_all, number_shots=N_SHOTS)
+
+        # compare and get counts: TP, FP ..
+        tp_count, fp_count, fn_count, total_n_events_in_audiofile = compute_tp_fp_fn(pred_events_by_audiofile[audiofilename], ref_events_this_audiofile)
+
+        counts_per_audiofile[audiofilename] = {"TP": tp_count, "FP": fp_count, "FN": fn_count, "total_n_pos_events": total_n_events_in_audiofile}
+        print(audiofilename, counts_per_audiofile[audiofilename])
+
+    if metadata:
+        # using the key for classes => audiofiles, load sets metadata:
+        with open(metadata) as metadatafile:
+            dataset_metadata = json.load(metadatafile)
+    else:
+        dataset_metadata = copy.deepcopy(gt_file_structure)
+
+    # include audiofiles for which there were no predictions:
+    list_all_audiofiles = []
+    for miniset in dataset_metadata[dataset].keys():
+        if metadata:
+            for cl in dataset_metadata[dataset][miniset].keys():
+                list_all_audiofiles.extend(dataset_metadata[dataset][miniset][cl])
+        else:
+            list_all_audiofiles.extend(dataset_metadata[dataset][miniset])
+
+    for audiofilename in list_all_audiofiles:
+        if audiofilename not in counts_per_audiofile.keys():
+            ref_events_this_audiofile = pd.read_csv(os.path.join(ref_file_path, inv_gt_file_structure[audiofilename], audiofilename[0:-4]+'.csv'), dtype=str)
+            # sort ref_events by starttime
+            ref_events_this_audiofile = ref_events_this_audiofile.sort_values(by='Starttime', axis=0, ascending=True)
+            total_n_pos_events_in_audiofile = len(select_events_with_value(ref_events_this_audiofile, value=POS_VALUE))
+            counts_per_audiofile[audiofilename] = {"TP": 0, "FP": 0, "FN": total_n_pos_events_in_audiofile, "total_n_pos_events": total_n_pos_events_in_audiofile}
+
+    # aggregate the counts per class or subset:
+    list_sets_in_mainset = list(dataset_metadata[dataset].keys())
+
+    counts_per_class_per_set = {}
+    scores_per_class_per_set = {}
+    counts_per_set = {}
+    scores_per_set = {}
+    scores_per_audiofile = {}
+    for data_set in list_sets_in_mainset:
+        # print(data_set)
+
+        if metadata:
+            list_classes_in_set = list(dataset_metadata[dataset][data_set].keys())
+
+            counts_per_class_per_set[data_set] = {}
+            tp_set = 0
+            fn_set = 0
+            fp_set = 0
+            total_n_events_set = 0
+            for cl in list_classes_in_set:
+                # print(cl)
+                list_audiofiles_this_class = dataset_metadata[dataset][data_set][cl]
+                tp = 0
+                fn = 0
+                fp = 0
+                total_n_pos_events_this_class = 0
+                for audiofile in list_audiofiles_this_class:
+                    scores_per_audiofile[audiofile] = compute_scores_from_counts(counts_per_audiofile[audiofile])
+
+                    tp = tp + counts_per_audiofile[audiofile]["TP"]
+                    tp_set = tp_set + counts_per_audiofile[audiofile]["TP"]
+                    fn = fn + counts_per_audiofile[audiofile]["FN"]
+                    fn_set = fn_set + counts_per_audiofile[audiofile]["FN"]
+                    fp = fp + counts_per_audiofile[audiofile]["FP"]
+                    fp_set = fp_set + counts_per_audiofile[audiofile]["FP"]
+                    total_n_pos_events_this_class = total_n_pos_events_this_class + counts_per_audiofile[audiofile]["total_n_pos_events"]
+                    total_n_events_set = total_n_events_set + counts_per_audiofile[audiofile]["total_n_pos_events"]
+
+                # counts_per_class[cl] = {"TP": tp, "FN": fn, "FP": fp, "total_n_pos_events_this_class": total_n_pos_events_this_class}
+                counts_per_class_per_set[data_set][cl] = {"TP": tp, "FN": fn, "FP": fp, "total_n_pos_events_this_class": total_n_pos_events_this_class}
+            counts_per_set[data_set] = {"TP": tp_set, "FN": fn_set, "FP": fp_set, "total_n_pos_events_this_set": total_n_events_set}
+
+            # compute scores per subset.
+            scores_per_set[data_set] = compute_scores_from_counts(counts_per_set[data_set])
+            # compute scores per class
+            scores_per_class_per_set[data_set] = compute_scores_per_class(counts_per_class_per_set[data_set])
+
+        else:
+            list_audiofiles_in_set = dataset_metadata[dataset][data_set]
+            tp = 0
+            fn = 0
+            fp = 0
+            total_n_pos_events_this_set = 0
+            for audiofile in list_audiofiles_in_set:
+
+                scores_per_audiofile[audiofile] = compute_scores_from_counts(counts_per_audiofile[audiofile])
+                tp = tp + counts_per_audiofile[audiofile]["TP"]
+                fn = fn + counts_per_audiofile[audiofile]["FN"]
+                fp = fp + counts_per_audiofile[audiofile]["FP"]
+                total_n_pos_events_this_set = total_n_pos_events_this_set + counts_per_audiofile[audiofile]["total_n_pos_events"]
+            counts_per_set[data_set] = {"TP": tp, "FN": fn, "FP": fp, "total_n_pos_events_this_set": total_n_pos_events_this_set}
+
+            # compute scores per subset
+            scores_per_set[data_set] = compute_scores_from_counts(counts_per_set[data_set])
+
+    # overall_scores = {"precision": stats.hmean([scores_per_set[dt]["precision"] for dt in scores_per_set.keys()]),
+    #                   "recall": stats.hmean([scores_per_set[dt]["recall"] for dt in scores_per_set.keys()]),
+    #                   "fmeasure (percentage)": np.round(stats.hmean([scores_per_set[dt]["f-measure"] for dt in scores_per_set.keys()])*100, 3)
+    #                   }
+
+    # print("\nOverall_scores:", overall_scores)
+    # print("\nwriting report")
+    fscore = np.round(stats.hmean([scores_per_set[dt]["f-measure"] for dt in scores_per_set.keys()])*100, 3)
+    return fscore
+
+if __name__ == "__main__":
+
+    all_files = glob.glob("/data/DCASEfewshot/validate/d8f698b184e75c3ef4e830f9da4f148071fb4c56/results/beats/models/**/eval_out.csv",
+                          recursive=True)
+
+    l_fscores = []
+
+    for file in all_files:
+
+        fscore = evaluate(pred_file_path=file,
+                          ref_file_path="/data/DCASE/Development_Set_annotations/Validation_Set",
+                          team_name="whatever",
+                          dataset="VAL",
+                          savepath="/data/.")
+
+        l_fscores.append(fscore)
+
+    fscore_a = np.asarray(l_fscores, dtype=np.float32)
+    mean_f1 = np.mean(fscore_a)
+    std_f1 = np.std(fscore_a)
+
+    print(fscore_a)
+    print(f"MEAN IS {mean_f1}, STD IS {std_f1}")
+
+# docker run -v $PWD:/app -v /home/benjamin.cretois/data/DCASE/:/data --gpus all beats poetry run python evaluate/evaluation_metrics/evaluation_all.py
\ No newline at end of file
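For reference, not part of the patch: a minimal, self-contained sketch of the scoring recipe that evaluation_all.py applies. Per-subset TP/FP/FN counts are turned into f-measures, each run's F-score is the harmonic mean of the per-subset f-measures expressed as a percentage, and the mean and standard deviation are then taken across runs. The subset names and counts below are hypothetical, chosen only for illustration.

# Hypothetical illustration of the aggregation performed above:
# counts -> f-measure per subset, harmonic mean over subsets -> one F-score per run,
# then mean/std across runs.
import numpy as np
from scipy import stats

MIN_EVAL_VALUE = 0.00001  # same floor as in evaluation_all.py

def fmeasure_from_counts(tp, fp, fn):
    # event-based F-measure, floored so the harmonic mean stays defined
    return tp / (tp + 0.5 * (fp + fn)) if tp != 0 else MIN_EVAL_VALUE

# assumed per-subset (TP, FP, FN) counts for three independent runs (made-up values)
runs = [
    {"subsetA": (40, 10, 20), "subsetB": (15, 30, 25)},
    {"subsetA": (42, 12, 18), "subsetB": (14, 28, 26)},
    {"subsetA": (38, 9, 22), "subsetB": (16, 33, 24)},
]

l_fscores = []
for per_set_counts in runs:
    per_set_f = [fmeasure_from_counts(*c) for c in per_set_counts.values()]
    # harmonic mean over subsets, reported as a percentage (as in evaluate())
    l_fscores.append(np.round(stats.hmean(per_set_f) * 100, 3))

fscore_a = np.asarray(l_fscores, dtype=np.float32)
print(fscore_a)
print(f"MEAN IS {np.mean(fscore_a)}, STD IS {np.std(fscore_a)}")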