Merge pull request #116 from Azure-Samples/evaluations
adding evaluate folder.
Showing 4 changed files with 219 additions and 0 deletions.
Empty file.
@@ -0,0 +1,3 @@
{"research_context": "Can you find the latest camping trends and what folks are doing in the winter?", "product_context": "Can you use a selection of tents and sleeping bags as context?", "assignment_context": "Write a fun and engaging article that includes the research and product information. The article should be between 800 and 1000 words."}
{"research_context": "Can you find the latest trends in hiking shoes?", "product_context": "Can you use a selection of hiking shoes as context?", "assignment_context": "Write an article about the best kind of hiking shoes"}
{"research_context": "Find information about the best snow camping spots in the world", "product_context": "Can you use a selection of tents that are good for snow as context?", "assignment_context": "Write an article about the best kind of tents for snow camping"}
@@ -0,0 +1,150 @@
# https://github.com/Azure-Samples/contoso-chat/blob/may-2024-updates/evaluations/evaluate-chat-flow-sdk.ipynb
import os
import sys
import json
import time
import concurrent.futures
from pathlib import Path
from datetime import datetime

import jsonlines
import pandas as pd
from dotenv import load_dotenv
from promptflow.core import AzureOpenAIModelConfiguration
from promptflow.evals.evaluate import evaluate

from evaluate.evaluators import ArticleEvaluator
from orchestrator import create

load_dotenv()
folder = Path(__file__).parent.absolute().as_posix()

# # Add the api directory to the sys.path
# sys.path.append(os.path.abspath('../src/api'))

def evaluate_aistudio(model_config, data_path):
    # create unique id for each run with date and time
    run_prefix = datetime.now().strftime("%Y%m%d%H%M%S")
    run_id = f"{run_prefix}_chat_evaluation_sdk"
    print(run_id)

    result = evaluate(
        evaluation_name=run_id,
        data=data_path,
        evaluators={
            "article": ArticleEvaluator(model_config),
        },
        evaluator_config={
            "defaults": {
                "query": "${data.query}",
                "response": "${data.response}",
                "context": "${data.context}",
            },
        },
    )
    return result

def evaluate_data(model_config, data_path):
    writer_evaluator = ArticleEvaluator(model_config)

    data = []
    with open(data_path) as f:
        for line in f:
            data.append(json.loads(line))

    results = []
    for row in data:
        result = writer_evaluator(query=row["query"], context=row["context"], response=row["response"])
        print("Evaluation results: ", result)
        results.append(result)

    return results

def run_orchestrator(research_context, product_context, assignment_context):
    query = {"research_context": research_context, "product_context": product_context, "assignment_context": assignment_context}
    context = {}
    response = None

    for result in create(research_context, product_context, assignment_context):
        if result[0] == "researcher":
            context['research'] = result[1]
        if result[0] == "products":
            context['products'] = result[1]
        if result[0] == "writer":
            response = result[1]

    return {
        "query": json.dumps(query),
        "context": json.dumps(context),
        "response": json.dumps(response),
    }

def evaluate_orchestrator(model_config, data_path):
    writer_evaluator = ArticleEvaluator(model_config)

    data = []
    with open(data_path) as f:
        for line in f:
            data.append(json.loads(line))

    eval_data = []
    eval_results = []

    results = []
    futures = []
    def evaluate_row(research_context, product_context, assignment_context):
        result = { "research_context": research_context }
        print("Running orchestrator...")
        # keep the orchestrator output so it can be persisted to eval_data.jsonl below
        row_data = run_orchestrator(research_context, product_context, assignment_context)
        eval_data.append(row_data)
        print("Evaluating results...")
        eval_result = writer_evaluator(query=row_data["query"], context=row_data["context"], response=row_data["response"])
        result.update(eval_result)
        print("Evaluation results: ", eval_result)
        eval_results.append(result)

    with concurrent.futures.ThreadPoolExecutor() as executor:
        for row in data:
            futures.append(executor.submit(evaluate_row, row["research_context"], row["product_context"], row["assignment_context"]))
        for future in futures:
            results.append(future.result())

    # write out eval data to a file so we can re-run evaluation on it
    with jsonlines.open(folder + '/eval_data.jsonl', 'w') as writer:
        for row in eval_data:
            writer.write(row)

    print("Evaluation summary:\n")
    results_df = pd.DataFrame.from_dict(eval_results)
    print(results_df)

    mean_df = results_df.drop("research_context", axis=1).mean()
    print("\nAverage scores:")
    print(mean_df)

    results_df.to_markdown(folder + '/eval_results.md')
    with open(folder + '/eval_results.md', 'a') as file:
        file.write("\n\nAverage scores:\n\n")
    mean_df.to_markdown(folder + '/eval_results.md', 'a')

    with jsonlines.open(folder + '/eval_results.jsonl', 'w') as writer:
        writer.write(eval_results)

    return eval_results

if __name__ == "__main__":
    # Initialize Azure OpenAI Connection
    model_config = AzureOpenAIModelConfiguration(
        azure_deployment=os.environ["AZURE_OPENAI_4_EVAL_DEPLOYMENT_NAME"],
        api_version=os.environ["AZURE_OPENAI_API_VERSION"],
        azure_endpoint=f"https://{os.getenv('AZURE_OPENAI_NAME')}.cognitiveservices.azure.com/"
    )

    start = time.time()
    print("Starting evaluate...")
    print(os.environ["BING_SEARCH_ENDPOINT"])
    print("value: ", os.environ["BING_SEARCH_KEY"], len(os.environ["BING_SEARCH_KEY"]))
    eval_result = evaluate_orchestrator(model_config, data_path=folder + "/eval_inputs.jsonl")

    end = time.time()
    print(f"Finished evaluate in {end - start}s")
@@ -0,0 +1,66 @@
import os
import json
from threading import Thread
from opentelemetry import trace
from opentelemetry.trace import set_span_in_context
from promptflow.core import AzureOpenAIModelConfiguration
from promptflow.evals.evaluators import RelevanceEvaluator, GroundednessEvaluator, FluencyEvaluator, CoherenceEvaluator


class ArticleEvaluator:
    def __init__(self, model_config):
        self.evaluators = [
            RelevanceEvaluator(model_config),
            FluencyEvaluator(model_config),
            CoherenceEvaluator(model_config),
            GroundednessEvaluator(model_config),
        ]

    def __call__(self, *, query: str, context: str, response: str, **kwargs):
        output = {}
        for evaluator in self.evaluators:
            result = evaluator(
                question=query,
                context=context,
                answer=response,
            )
            output.update(result)
        return output

def evaluate_article(data, trace_context):
    print("starting offline evals")

    tracer = trace.get_tracer(__name__)
    with tracer.start_as_current_span("run_evaluators", context=trace_context) as span:
        span.set_attribute("inputs", json.dumps(data))
        configuration = AzureOpenAIModelConfiguration(
            azure_deployment=os.environ["AZURE_OPENAI_4_EVAL_DEPLOYMENT_NAME"],
            api_version=os.environ["AZURE_OPENAI_API_VERSION"],
            azure_endpoint=f"https://{os.getenv('AZURE_OPENAI_NAME')}.cognitiveservices.azure.com/"
        )
        evaluator = ArticleEvaluator(configuration)
        results = evaluator(query=data['query'], context=data['context'], response=data['response'])
        resultsJson = json.dumps(results)
        span.set_attribute("output", resultsJson)

        print("results: ", resultsJson)

def evaluate_article_in_background(research_context, product_context, assignment_context, research, products, article):
    eval_data = {
        "query": json.dumps({
            "research_context": research_context,
            "product_context": product_context,
            "assignment_context": assignment_context
        }),
        "context": json.dumps({
            "research": research,
            "products": products,
        }),
        "response": json.dumps(article)
    }

    # propagate trace context to the new thread
    span = trace.get_current_span()
    trace_context = set_span_in_context(span)
    thread = Thread(target=evaluate_article, args=(eval_data, trace_context,))
    thread.start()
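For context, evaluate_article_in_background is meant to be fired from the application once the orchestrator has produced its research, product, and article outputs. The sketch below is illustrative only: the argument names come from the signature above, the import path mirrors the evaluate.evaluators import used by the evaluation script, and every value is made up, since the real call site is not part of this diff.

# Illustrative call site only; the real invocation lives in the API/orchestrator
# code, which is not included in this commit. All values below are made up.
from evaluate.evaluators import evaluate_article_in_background

research_context = "Find the latest camping trends"
product_context = "Use a selection of tents as context"
assignment_context = "Write an article about winter camping"

research = {"web_results": ["..."]}            # output of the researcher step
products = [{"name": "Alpine Tent", "id": 1}]  # output of the products step
article = "Winter camping is..."               # output of the writer step

# Runs the evaluators on a background thread and links the resulting
# "run_evaluators" span to the current trace via set_span_in_context.
evaluate_article_in_background(
    research_context, product_context, assignment_context,
    research, products, article,
)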