feat: started working on SWE-bench evals #142

Open · wants to merge 2 commits into master
15 changes: 15 additions & 0 deletions gptme/eval/swebench/__init__.py
@@ -0,0 +1,15 @@
from .utils import (
load_instances,
load_instance,
setup_swebench_repo,
get_file_spans_from_patch,
)
from .evaluate import run_swebench_evaluation

__all__ = [
"load_instances",
"load_instance",
"setup_swebench_repo",
"get_file_spans_from_patch",
"run_swebench_evaluation",
]
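
As a usage sketch of this public API (assuming the GPTMe agent from gptme.eval.agents, as wired up in main.py below; the instance id is only illustrative):

from gptme.eval.agents import GPTMe
from gptme.eval.swebench import run_swebench_evaluation

agent = GPTMe(model="openai/gpt-4o")
results = run_swebench_evaluation(
    agent,
    instance_ids=["django__django-11099"],  # illustrative instance id
)
for result in results:
    print(result.name, result.status)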
4 changes: 4 additions & 0 deletions gptme/eval/swebench/__main__.py
@@ -0,0 +1,4 @@
from .main import main

if __name__ == "__main__":
main()
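
This entry point makes the package runnable as a module (python -m gptme.eval.swebench), with the CLI options defined on main() in main.py below.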
122 changes: 122 additions & 0 deletions gptme/eval/swebench/evaluate.py
@@ -0,0 +1,122 @@
import json
import logging
import time

from gptme.eval.agents import Agent
from gptme.eval.types import CaseResult, EvalResult

from .utils import get_file_spans_from_patch, load_instances, setup_swebench_repo

logger = logging.getLogger(__name__)


def run_swebench_evaluation(
agent: Agent,
dataset_name: str = "princeton-nlp/SWE-bench_Lite",
split: str = "test",
instance_ids: list[str] | None = None,
repo_base_dir: str | None = None,
) -> list[EvalResult]:
logger.info(
f"Starting SWE-bench evaluation with dataset: {dataset_name}, split: {split}"
)
instances = load_instances(dataset_name, split)

if instance_ids:
logger.info(f"Filtering instances to: {instance_ids}")
instances = {id: instances[id] for id in instance_ids if id in instances}

logger.info(f"Evaluating {len(instances)} instances")

results = []

for instance_id, instance in instances.items():
logger.info(f"Evaluating instance: {instance_id}")
result = evaluate_instance(agent, instance, repo_base_dir)
results.append(result)

logger.info(f"Completed evaluation of {len(results)} instances")
return results


def evaluate_instance(
agent: Agent, instance: dict, repo_base_dir: str | None
) -> EvalResult:
instance_id = instance["instance_id"]
problem_statement = instance["problem_statement"]

logger.info(f"Evaluating instance: {instance_id}")
logger.debug(f"Problem statement: {problem_statement}")

start_time = time.time()
try:
logger.info(f"Executing agent for instance {instance_id}")
repo_dir = setup_swebench_repo(instance, repo_base_dir)
files = agent.act({"repo_dir": repo_dir}, problem_statement)
except Exception as e:
logger.error(f"Error during agent execution for instance {instance_id}: {e}")
return EvalResult(
name=instance_id,
status="error",
results=[],
timings={"gen": time.time() - start_time, "run": 0, "eval": 0},
gen_stdout="",
gen_stderr=str(e),
run_stdout="",
run_stderr="",
)

gen_time = time.time() - start_time
logger.info(
f"Agent execution completed for instance {instance_id} in {gen_time:.2f} seconds"
)

# Evaluate the result
logger.info(f"Evaluating patch for instance {instance_id}")
eval_start = time.time()
diff = str(files.get("diff", ""))
passed = evaluate_patch(instance, diff)
eval_time = time.time() - eval_start

logger.info(f"Evaluation completed for instance {instance_id}. Passed: {passed}")

return EvalResult(
name=instance_id,
status="success",
results=[
CaseResult(name="patch_correctness", passed=passed, duration=eval_time)
],
timings={"gen": gen_time, "run": 0, "eval": eval_time},
gen_stdout="",
gen_stderr="",
run_stdout=diff,
run_stderr="",
)


def evaluate_patch(instance: dict, generated_patch: str) -> bool:
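    """Return True if the generated patch touches every expected file; if the
    instance has no 'expected_spans', fall back to an exact patch comparison."""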
logger.debug(f"Instance keys: {instance.keys()}")
logger.debug(f"Instance content: {json.dumps(instance, indent=2)}")

if "expected_spans" not in instance:
logger.warning(
"'expected_spans' not found in instance data. Using 'patch' instead."
)
expected_patch = instance.get("patch", "")
logger.debug(f"Expected patch: {expected_patch}")
logger.debug(f"Generated patch: {generated_patch}")
return expected_patch.strip() == generated_patch.strip()

expected_spans = instance["expected_spans"]
generated_spans = get_file_spans_from_patch(generated_patch)

logger.debug(f"Expected spans: {expected_spans}")
logger.debug(f"Generated spans: {generated_spans}")

for file_path in expected_spans.keys():
if file_path not in generated_spans:
logger.info(f"File {file_path} not found in generated patch")
return False

logger.info("All expected files found in generated patch")
return True
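
A small, self-contained sketch of the file-coverage check above (the path and span names are made up for illustration):

from gptme.eval.swebench import get_file_spans_from_patch

expected_spans = {"django/forms/models.py": ["ModelChoiceField.to_python"]}
generated_patch = (
    "diff --git a/django/forms/models.py b/django/forms/models.py\n"
    "--- a/django/forms/models.py\n"
    "+++ b/django/forms/models.py\n"
)
generated_spans = get_file_spans_from_patch(generated_patch)
# The check passes as long as every expected file appears in the generated patch;
# the spans themselves are not compared yet.
assert set(expected_spans) <= set(generated_spans)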
91 changes: 91 additions & 0 deletions gptme/eval/swebench/main.py
@@ -0,0 +1,91 @@
import logging

import click
from gptme.eval.types import EvalResult

from ..agents import GPTMe
from ..main import write_results
from . import run_swebench_evaluation

# Configure logging
logging.basicConfig(
level=logging.INFO,
format="%(asctime)s - %(levelname)s - %(name)s - %(message)s",
)
logger = logging.getLogger(__name__)


@click.command()
@click.option(
"--model",
"-m",
multiple=True,
help="Model to use, can be passed multiple times.",
)
@click.option(
"--dataset",
default="princeton-nlp/SWE-bench_Lite",
help="SWE-bench dataset to use",
)
@click.option(
"--split",
default="test",
help="SWE-bench dataset split to use",
)
@click.option(
"--instance",
"-i",
multiple=True,
help="Specific SWE-bench instance IDs to evaluate",
)
@click.option(
"--repo-base-dir",
help="Base directory for repositories",
)
@click.option(
"--verbose",
"-v",
is_flag=True,
help="Increase output verbosity",
)
def main(
model: list[str],
dataset: str,
split: str,
instance: list[str],
repo_base_dir: str,
verbose: bool,
):
"""Run SWE-bench evaluation for gptme."""
if verbose:
logger.setLevel(logging.DEBUG)
logger.debug("Verbose output enabled")

if not model:
model = [
"openai/gpt-4o",
"anthropic/claude-3-5-sonnet-20240620",
]

print("=== Running SWE-bench evaluation ===")
swebench_results = {}
for m in model:
agent = GPTMe(model=m)
results: list[EvalResult] = run_swebench_evaluation(
agent,
dataset_name=dataset,
split=split,
instance_ids=instance if instance else None,
repo_base_dir=repo_base_dir,
)
swebench_results[m] = results

print("\n=== SWE-bench Results ===")
# TODO: Implement custom result printing for SWE-bench

# Write SWE-bench results to CSV
write_results(swebench_results)


if __name__ == "__main__":
main()
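
As a usage sketch (the instance id is illustrative), the CLI defined above could be invoked as:

python -m gptme.eval.swebench --model openai/gpt-4o --instance django__django-11099 -v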
89 changes: 89 additions & 0 deletions gptme/eval/swebench/utils.py
@@ -0,0 +1,89 @@
import os
import logging
import subprocess

from datasets import DownloadMode, load_dataset

logger = logging.getLogger(__name__)


def load_instances(
dataset_name: str = "princeton-nlp/SWE-bench_Lite", split: str = "test"
) -> dict[str, dict]:
data = load_dataset(
dataset_name, split=split, download_mode=DownloadMode.FORCE_REDOWNLOAD
)
return {d["instance_id"]: d for d in data}


def load_instance(
instance_id: str,
dataset_name: str = "princeton-nlp/SWE-bench_Lite",
split: str = "test",
) -> dict:
data = load_instances(dataset_name, split=split)
return data[instance_id]


def setup_swebench_repo(instance_data: dict, repo_base_dir: str | None = None) -> str:
if not repo_base_dir:
repo_base_dir = os.getenv("REPO_DIR", "/tmp/repos")

repo_dir_name = instance_data["repo"].replace("/", "__")
github_repo_path = f"swe-bench/{repo_dir_name}"
return setup_github_repo(
repo=github_repo_path,
base_commit=instance_data["base_commit"],
base_dir=repo_base_dir,
)


def get_file_spans_from_patch(patch: str) -> dict[str, list[str]]:
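    """Map each file touched by the patch to a list of spans; span extraction is
    not implemented yet, so the lists are left empty."""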
file_spans: dict[str, list[str]] = {}
current_file: str | None = None

for line in patch.split("\n"):
if line.startswith("diff --git"):
current_file = line.split()[-1][2:] # Extract the file path
file_spans[current_file] = []

return file_spans


def setup_github_repo(repo: str, base_commit: str, base_dir: str | None = None) -> str:
if base_dir is None:
base_dir = os.getenv("REPO_DIR", "/tmp/repos")

repo_dir = os.path.join(base_dir, repo.replace("/", "_"))

try:
if not os.path.exists(repo_dir):
logger.info(f"Cloning repository {repo} to {repo_dir}")
os.makedirs(repo_dir, exist_ok=True)
subprocess.run(
["git", "clone", f"https://github.com/{repo}.git", repo_dir],
check=True,
capture_output=True,
text=True,
)

logger.info(f"Checking out commit {base_commit} in {repo_dir}")
os.chdir(repo_dir)
subprocess.run(
["git", "fetch", "origin"], check=True, capture_output=True, text=True
)
subprocess.run(
["git", "checkout", base_commit], check=True, capture_output=True, text=True
)

return repo_dir
    except subprocess.CalledProcessError as e:
        logger.error(f"Error setting up GitHub repo: {e}")
        logger.error(f"Command stdout: {e.output}")
        logger.error(f"Command stderr: {e.stderr}")
        raise
except Exception as e:
logger.error(f"Unexpected error setting up GitHub repo: {e}")
raise
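
A usage sketch for these helpers (the instance id is illustrative; repositories are cloned under /tmp/repos unless REPO_DIR or repo_base_dir is set):

from gptme.eval.swebench import load_instance, setup_swebench_repo

instance = load_instance("django__django-11099")  # illustrative instance id
repo_dir = setup_swebench_repo(instance)  # clones the swe-bench mirror repo and checks out base_commit
print(repo_dir)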