Commit 80a7c2c
[ADD] evaluation_all.py for getting mean and std of multiple runs
1 parent df52251
Showing 2 changed files with 299 additions and 1 deletion.
evaluate/evaluation_metrics/evaluation_all.py
@@ -0,0 +1,298 @@
import pandas as pd
import os
import json
import numpy as np
import copy
import glob

import evaluate.evaluation_metrics.metrics as metrics
from scipy import stats


MIN_EVAL_VALUE = 0.00001
N_SHOTS = 5
MIN_IOU_TH = 0.3
PRED_FILE_HEADER = ["Audiofilename", "Starttime", "Endtime"]
POS_VALUE = 'POS'
UNK_VALUE = 'UNK'

def remove_shots_from_ref(ref_df, number_shots=5):
    # Drop the first `number_shots` POS events (the few-shot support set) and
    # all events (POS and UNK) that end at or before the end of that last shot.
    ref_pos_indexes = select_events_with_value(ref_df, value=POS_VALUE)
    ref_n_shot_index = ref_pos_indexes[number_shots - 1]
    events_to_drop = ref_df.index[ref_df['Endtime'] <= ref_df.iloc[ref_n_shot_index]['Endtime']].tolist()

    return ref_df.drop(events_to_drop)

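# Example with hypothetical data: with number_shots=5, every event ending at
# or before the end of the 5th POS event is dropped, e.g.
#   ref = pd.DataFrame({"Starttime": [0, 1, 2, 3, 4, 5, 6],
#                       "Endtime":   [1, 2, 3, 4, 5, 6, 7],
#                       "Q": ["POS"] * 5 + ["UNK", "POS"]})
#   remove_shots_from_ref(ref)  # drops rows 0-4, keeps the UNK and final POS
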
def select_events_with_value(data_frame, value=POS_VALUE):
    # Return the index labels of all rows whose "Q" column equals `value`.
    indexes_list = data_frame.index[data_frame["Q"] == value].tolist()

    return indexes_list

def build_matrix_from_selected_rows(data_frame, selected_indexes_list):
    # Build a 2 x n_events matrix: row 0 holds start times, row 1 end times.
    matrix_data = np.ones((2, len(selected_indexes_list))) * -1
    for n, idx in enumerate(selected_indexes_list):
        matrix_data[0, n] = data_frame.loc[idx].Starttime  # start time for event n
        matrix_data[1, n] = data_frame.loc[idx].Endtime  # end time for event n
    return matrix_data

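# Illustrative sketch: metrics.match_events is assumed to bipartite-match
# events whose temporal IoU reaches min_iou. For two (start, end) intervals
# that criterion can be computed as below (this helper is hypothetical and
# not called by the script):
def interval_iou(ref_event, pred_event):
    # intersection and union of two time intervals given as (start, end)
    inter = max(0.0, min(ref_event[1], pred_event[1]) - max(ref_event[0], pred_event[0]))
    union = (ref_event[1] - ref_event[0]) + (pred_event[1] - pred_event[0]) - inter
    return inter / union if union > 0.0 else 0.0
# e.g. interval_iou((0.0, 1.0), (0.5, 1.5)) = 0.5 / 1.5 = 0.33..., which
# passes MIN_IOU_TH = 0.3.
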
def compute_tp_fp_fn(pred_events_df, ref_events_df):
    # Inputs: dataframe with predicted events, dataframe with reference events
    # and their value (POS, UNK, NEG).
    # Output: true positive, false positive and false negative counts, plus
    # the total number of POS events in the reference.

    # First pass: bipartite graph matching between predicted events and
    # reference POS events yields the TPs. Second pass: match the remaining
    # predicted events against the reference UNK events. FP is the number of
    # predicted events still unmatched after the two rounds of matches;
    # FN is the number of reference POS events left unmatched.

    ref_pos_indexes = select_events_with_value(ref_events_df, value=POS_VALUE)

    if "Q" not in pred_events_df.columns:
        pred_events_df["Q"] = POS_VALUE

    # sort events by start time
    pred_events_df = pred_events_df.sort_values(by='Starttime', axis=0, ascending=True)
    pred_pos_indexes = select_events_with_value(pred_events_df, value=POS_VALUE)

    ref_1st_round = build_matrix_from_selected_rows(ref_events_df, ref_pos_indexes)
    pred_1st_round = build_matrix_from_selected_rows(pred_events_df, pred_pos_indexes)

    m_pos = metrics.match_events(ref_1st_round, pred_1st_round, min_iou=MIN_IOU_TH)
    matched_ref_indexes = [ri for ri, pi in m_pos]
    matched_pred_indexes = [pi for ri, pi in m_pos]

    ref_unk_indexes = select_events_with_value(ref_events_df, value=UNK_VALUE)
    ref_2nd_round = build_matrix_from_selected_rows(ref_events_df, ref_unk_indexes)

    unmatched_pred_events = list(set(range(pred_1st_round.shape[1])) - set(matched_pred_indexes))
    pred_2nd_round = pred_1st_round[:, unmatched_pred_events]

    m_unk = metrics.match_events(ref_2nd_round, pred_2nd_round, min_iou=MIN_IOU_TH)

    # print("# Positive matches between Ref and Pred :", len(m_pos))
    # print("# matches with Unknown events: ", len(m_unk))

    tp = len(m_pos)
    fp = pred_1st_round.shape[1] - tp - len(m_unk)

    # compute unmatched POS ref events:
    count_unmatched_pos_ref_events = len(ref_pos_indexes) - tp

    fn = count_unmatched_pos_ref_events

    total_n_POS_events = len(ref_pos_indexes)
    return tp, fp, fn, total_n_POS_events

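# Toy example with hypothetical events: if the reference holds 3 POS and
# 1 UNK event, and 4 events are predicted, of which 2 match POS events and
# 1 matches the UNK event (IoU >= MIN_IOU_TH), then:
#   tp = 2              (POS matches from the 1st round)
#   fp = 4 - 2 - 1 = 1  (predictions unmatched after both rounds)
#   fn = 3 - 2 = 1      (reference POS events left unmatched)
#   total_n_POS_events = 3
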
def compute_scores_per_class(counts_per_class):

    scores_per_class = {}
    for cl in counts_per_class.keys():
        tp = counts_per_class[cl]["TP"]
        fp = counts_per_class[cl]["FP"]
        fn = counts_per_class[cl]["FN"]

        # to compute the harmonic mean, all entries must be non-zero
        precision = tp / (tp + fp) if tp + fp != 0 else MIN_EVAL_VALUE  # case where no predictions were made
        if precision < MIN_EVAL_VALUE:
            precision = MIN_EVAL_VALUE
        recall = tp / (fn + tp) if tp != 0 else MIN_EVAL_VALUE
        fmeasure = tp / (tp + 0.5 * (fp + fn)) if tp != 0 else MIN_EVAL_VALUE

        scores_per_class[cl] = {"precision": precision, "recall": recall, "f-measure": fmeasure}

    return scores_per_class

def compute_scores_from_counts(counts):
    tp = counts["TP"]
    fp = counts["FP"]
    fn = counts["FN"]

    # to compute the harmonic mean, all entries must be non-zero
    precision = tp / (tp + fp) if tp + fp != 0 else MIN_EVAL_VALUE  # case where no predictions were made
    if precision < MIN_EVAL_VALUE:
        precision = MIN_EVAL_VALUE
    recall = tp / (fn + tp) if tp != 0 else MIN_EVAL_VALUE
    fmeasure = tp / (tp + 0.5 * (fp + fn)) if tp != 0 else MIN_EVAL_VALUE

    scores = {"precision": precision, "recall": recall, "f-measure": fmeasure}

    return scores

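# Worked example with hypothetical counts: for TP=8, FP=2, FN=4,
#   precision = 8 / (8 + 2)             = 0.8
#   recall    = 8 / (4 + 8)             = 0.666...
#   f-measure = 8 / (8 + 0.5 * (2 + 4)) = 0.727...
# i.e. compute_scores_from_counts({"TP": 8, "FP": 2, "FN": 4}) returns these.
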
def evaluate(pred_file_path, ref_file_path, team_name, dataset, savepath, metadata=None):

    print("\nEvaluation for:", team_name, dataset)
    # Read the GT file structure: collect the subsets and the paths of the
    # reference csvs, and build an inverted dictionary with audiofilenames as
    # keys and their subset folder as value.
    gt_file_structure = {}
    gt_file_structure[dataset] = {}
    inv_gt_file_structure = {}
    list_of_subsets = os.listdir(ref_file_path)
    for subset in list_of_subsets:
        gt_file_structure[dataset][subset] = [os.path.basename(fl)[0:-4] + '.wav' for fl in glob.glob(os.path.join(ref_file_path, subset, "*.csv"))]
        for audiofile in gt_file_structure[dataset][subset]:
            inv_gt_file_structure[audiofile] = subset

    # read prediction csv
    pred_csv = pd.read_csv(pred_file_path, dtype=str)
    # verify headers:
    if list(pred_csv.columns) != PRED_FILE_HEADER:
        print('Please correct the header of the prediction file. This should be', PRED_FILE_HEADER)
        exit(1)
    # parse prediction csv: split the file into lists of events, one per audiofile.
    pred_events_by_audiofile = dict(tuple(pred_csv.groupby('Audiofilename')))

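    # Expected prediction csv layout (the header must equal PRED_FILE_HEADER);
    # the rows below are hypothetical:
    #   Audiofilename,Starttime,Endtime
    #   file1.wav,12.35,12.82
    #   file1.wav,15.10,15.47
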
    counts_per_audiofile = {}
    for audiofilename in list(pred_events_by_audiofile.keys()):

        # for each audiofile, load the corresponding GT file (audiofilename.csv)
        ref_events_this_audiofile_all = pd.read_csv(os.path.join(ref_file_path, inv_gt_file_structure[audiofilename], audiofilename[0:-4] + '.csv'), dtype={'Starttime': np.float64, 'Endtime': np.float64})
        # sort events by start time:
        ref_events_this_audiofile_all = ref_events_this_audiofile_all.sort_values(by='Starttime', axis=0, ascending=True)

        # remove the N_SHOTS (5) shots from the GT:
        ref_events_this_audiofile = remove_shots_from_ref(ref_events_this_audiofile_all, number_shots=N_SHOTS)

        # compare and get counts: TP, FP, FN
        tp_count, fp_count, fn_count, total_n_events_in_audiofile = compute_tp_fp_fn(pred_events_by_audiofile[audiofilename], ref_events_this_audiofile)

        counts_per_audiofile[audiofilename] = {"TP": tp_count, "FP": fp_count, "FN": fn_count, "total_n_pos_events": total_n_events_in_audiofile}
        print(audiofilename, counts_per_audiofile[audiofilename])

    if metadata:
        # load the sets' metadata: a JSON mapping classes to audiofiles
        with open(metadata) as metadatafile:
            dataset_metadata = json.load(metadatafile)
    else:
        dataset_metadata = copy.deepcopy(gt_file_structure)

    # include audiofiles for which there were no predictions:
    list_all_audiofiles = []
    for miniset in dataset_metadata[dataset].keys():
        if metadata:
            for cl in dataset_metadata[dataset][miniset].keys():
                list_all_audiofiles.extend(dataset_metadata[dataset][miniset][cl])
        else:
            list_all_audiofiles.extend(dataset_metadata[dataset][miniset])

    for audiofilename in list_all_audiofiles:
        if audiofilename not in counts_per_audiofile.keys():
            ref_events_this_audiofile = pd.read_csv(os.path.join(ref_file_path, inv_gt_file_structure[audiofilename], audiofilename[0:-4] + '.csv'), dtype=str)
            # sort ref events by start time
            ref_events_this_audiofile = ref_events_this_audiofile.sort_values(by='Starttime', axis=0, ascending=True)
            total_n_pos_events_in_audiofile = len(select_events_with_value(ref_events_this_audiofile, value=POS_VALUE))
            counts_per_audiofile[audiofilename] = {"TP": 0, "FP": 0, "FN": total_n_pos_events_in_audiofile, "total_n_pos_events": total_n_pos_events_in_audiofile}

    # aggregate the counts per class or subset:
    list_sets_in_mainset = list(dataset_metadata[dataset].keys())

    counts_per_class_per_set = {}
    scores_per_class_per_set = {}
    counts_per_set = {}
    scores_per_set = {}
    scores_per_audiofile = {}
    for data_set in list_sets_in_mainset:
        # print(data_set)

        if metadata:
            list_classes_in_set = list(dataset_metadata[dataset][data_set].keys())

            counts_per_class_per_set[data_set] = {}
            tp_set = 0
            fn_set = 0
            fp_set = 0
            total_n_events_set = 0
            for cl in list_classes_in_set:
                # print(cl)
                list_audiofiles_this_class = dataset_metadata[dataset][data_set][cl]
                tp = 0
                fn = 0
                fp = 0
                total_n_pos_events_this_class = 0
                for audiofile in list_audiofiles_this_class:
                    scores_per_audiofile[audiofile] = compute_scores_from_counts(counts_per_audiofile[audiofile])

                    tp = tp + counts_per_audiofile[audiofile]["TP"]
                    tp_set = tp_set + counts_per_audiofile[audiofile]["TP"]
                    fn = fn + counts_per_audiofile[audiofile]["FN"]
                    fn_set = fn_set + counts_per_audiofile[audiofile]["FN"]
                    fp = fp + counts_per_audiofile[audiofile]["FP"]
                    fp_set = fp_set + counts_per_audiofile[audiofile]["FP"]
                    total_n_pos_events_this_class = total_n_pos_events_this_class + counts_per_audiofile[audiofile]["total_n_pos_events"]
                    total_n_events_set = total_n_events_set + counts_per_audiofile[audiofile]["total_n_pos_events"]

                # counts_per_class[cl] = {"TP": tp, "FN": fn, "FP": fp, "total_n_pos_events_this_class": total_n_pos_events_this_class}
                counts_per_class_per_set[data_set][cl] = {"TP": tp, "FN": fn, "FP": fp, "total_n_pos_events_this_class": total_n_pos_events_this_class}
            counts_per_set[data_set] = {"TP": tp_set, "FN": fn_set, "FP": fp_set, "total_n_pos_events_this_set": total_n_events_set}

            # compute scores per subset
            scores_per_set[data_set] = compute_scores_from_counts(counts_per_set[data_set])
            # compute scores per class
            scores_per_class_per_set[data_set] = compute_scores_per_class(counts_per_class_per_set[data_set])

        else:
            list_audiofiles_in_set = dataset_metadata[dataset][data_set]
            tp = 0
            fn = 0
            fp = 0
            total_n_pos_events_this_set = 0
            for audiofile in list_audiofiles_in_set:

                scores_per_audiofile[audiofile] = compute_scores_from_counts(counts_per_audiofile[audiofile])
                tp = tp + counts_per_audiofile[audiofile]["TP"]
                fn = fn + counts_per_audiofile[audiofile]["FN"]
                fp = fp + counts_per_audiofile[audiofile]["FP"]
                total_n_pos_events_this_set = total_n_pos_events_this_set + counts_per_audiofile[audiofile]["total_n_pos_events"]
            counts_per_set[data_set] = {"TP": tp, "FN": fn, "FP": fp, "total_n_pos_events_this_set": total_n_pos_events_this_set}

            # compute scores per subset
            scores_per_set[data_set] = compute_scores_from_counts(counts_per_set[data_set])

    # overall_scores = {"precision": stats.hmean([scores_per_set[dt]["precision"] for dt in scores_per_set.keys()]),
    #                   "recall": stats.hmean([scores_per_set[dt]["recall"] for dt in scores_per_set.keys()]),
    #                   "fmeasure (percentage)": np.round(stats.hmean([scores_per_set[dt]["f-measure"] for dt in scores_per_set.keys()]) * 100, 3)
    #                   }

    # print("\nOverall_scores:", overall_scores)
    # print("\nwriting report")

    # overall f-score: harmonic mean of the per-set f-measures, as a percentage
    fscore = np.round(stats.hmean([scores_per_set[dt]["f-measure"] for dt in scores_per_set.keys()]) * 100, 3)
    return fscore

if __name__ == "__main__":

    # gather the eval_out.csv of every run and compute the mean and std of
    # their overall f-scores
    all_files = glob.glob("/data/DCASEfewshot/validate/d8f698b184e75c3ef4e830f9da4f148071fb4c56/results/beats/models/**/eval_out.csv",
                          recursive=True)

    l_fscores = []

    for file in all_files:

        fscore = evaluate(pred_file_path=file,
                          ref_file_path="/data/DCASE/Development_Set_annotations/Validation_Set",
                          team_name="whatever",
                          dataset="VAL",
                          savepath="/data/.")

        l_fscores.append(fscore)

    fscore_a = np.asarray(l_fscores, dtype=np.float32)
    mean_f1 = np.mean(fscore_a)
    std_f1 = np.std(fscore_a)
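    # Note: np.std defaults to ddof=0 (population standard deviation);
    # np.std(fscore_a, ddof=1) would give the sample standard deviation
    # across runs.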

    print(fscore_a)
    print(f"MEAN IS {mean_f1}, STD IS {std_f1}")

# docker run -v $PWD:/app -v /home/benjamin.cretois/data/DCASE/:/data --gpus all beats poetry run python evaluate/evaluation_metrics/evaluation_all.py