
Emnlp #7

Open
wants to merge 93 commits into base: main
Changes from 84 commits (93 commits total)
Commits:
a2ca02a
Add ATP-specific generate DVs workflow
vinhowe May 12, 2023
095912c
Apparently we could generate DVs automatically
vinhowe May 12, 2023
298d028
Add survey cost estimation code
vinhowe May 16, 2023
b6a536e
Add code to automatically generate ATP configs
vinhowe May 16, 2023
a65478c
Lowercase variables because it works for neurips
vinhowe May 16, 2023
39026e1
Add 'culling sampled below n' code (note!)
vinhowe May 16, 2023
f937f59
Merge branch 'main' into neurips
vinhowe May 19, 2023
80110d9
Add output for debugging prompt construction
vinhowe May 19, 2023
08e9bb5
Revert "Lowercase variables because it works for neurips"
vinhowe May 19, 2023
7e7adef
Undo ValidOption lowercasing
vinhowe May 19, 2023
7435097
Revert "Add 'culling sampled below n' code (note!)"
vinhowe May 20, 2023
e0df35c
Remove n_culle_sampled_below everywhere
vinhowe May 20, 2023
97e84c8
Unsort imports in survey.py
vinhowe May 20, 2023
b013cb9
Ignore .DS_Store
vinhowe May 20, 2023
298cf23
Merge branch 'main' into emnlp
vinhowe May 20, 2023
a990667
s/config_filename/variables_filename/g
vinhowe May 20, 2023
5fc9c00
Filter out out-of-schema variable values
vinhowe May 20, 2023
bd7a057
Working async openai sampler impl
vinhowe May 20, 2023
490e07c
Reformat
vinhowe May 20, 2023
81c22f7
Remove extra comment
vinhowe May 20, 2023
3bd43df
Update example_configure_survey.py
vinhowe May 20, 2023
7b68fa6
Remove unused argparse import
vinhowe May 20, 2023
fea3a1f
Reformat
vinhowe May 20, 2023
b4a8fe9
Merge branch 'emnlp' into async-openai-sampling
vinhowe May 20, 2023
3728470
Remove unused prompt printing
vinhowe May 20, 2023
a218599
Merge branch 'emnlp' into async-openai-sampling
vinhowe May 20, 2023
2877aab
Remove comments for copilot
vinhowe May 20, 2023
4e8155d
Merge branch 'emnlp' into async-openai-sampling
vinhowe May 20, 2023
10b50a5
replace responses.csv -> data.csv
vinhowe May 20, 2023
a41a1bd
Update folder structure.
alexgshaw May 22, 2023
27615c8
Add style check GitHub workflow
vinhowe May 22, 2023
056761b
Merge branch 'emnlp' into async-openai-sampling
vinhowe May 22, 2023
8035a00
Bug fix + add slot for response object.
alexgshaw May 22, 2023
98b4acf
Fix index typing error.
alexgshaw May 22, 2023
687dcd0
Merge branch 'emnlp' into async-openai-sampling
vinhowe May 22, 2023
bd381ba
Merge branch 'emnlp' into async-openai-sampling
vinhowe May 22, 2023
a7febdf
Update async openai sampler to return response
vinhowe May 22, 2023
82c356c
Add ordinal property to variables.json
vinhowe May 22, 2023
08f02c5
Handle AutoModel async
vinhowe May 22, 2023
05ddeef
Merge pull request #8 from BYU-PCCL/async-openai-sampling
vinhowe May 22, 2023
2ed52ae
Merge branch 'emnlp' into add-ordinal-to-variables
vinhowe May 22, 2023
2c31a63
Merge pull request #11 from BYU-PCCL/add-ordinal-to-variables
alexgshaw May 22, 2023
64afba1
Fix survey sampling.
alexgshaw May 22, 2023
23bb834
Merge branch 'emnlp' of https://github.com/BYU-PCCL/lm-survey into emnlp
alexgshaw May 22, 2023
2b82b88
Update atp configuration
vinhowe May 22, 2023
f126bc5
Add ordinal functionality to Question class.
alexgshaw May 22, 2023
639d687
Merge branch 'emnlp' of https://github.com/BYU-PCCL/lm-survey into emnlp
alexgshaw May 22, 2023
3740cfc
Rework folder structure.
alexgshaw May 22, 2023
d7198fb
Temp. revert "Add style check GitHub workflow"
vinhowe May 23, 2023
b0b14e5
bug fix.
alexgshaw May 23, 2023
aee5f3c
Minor bug fix on question.
alexgshaw May 23, 2023
f260e71
rename schema to variable
alexgshaw May 23, 2023
83bdc97
gitignore update
alexgshaw May 23, 2023
ea66dd5
Added crude representativeness scoring
chrisrytting May 24, 2023
8008a9f
Merge branch 'emnlp' of github.com:BYU-PCCL/lm-survey into emnlp
chrisrytting May 24, 2023
ac835b2
Update estimate survey to experiment config file
vinhowe May 24, 2023
d47640a
Merge branch 'emnlp' of github.com:BYU-PCCL/lm-survey into emnlp
vinhowe May 24, 2023
e3c4c8e
breadth experiment
chrisrytting May 24, 2023
f9347ac
Merge branch 'emnlp' of github.com:BYU-PCCL/lm-survey into emnlp
chrisrytting May 24, 2023
67bbf30
Merge branch 'emnlp' of github.com:BYU-PCCL/lm-survey into emnlp
vinhowe May 24, 2023
5940ff0
Add create atp experiment script
vinhowe May 24, 2023
39c2fe4
Check in variables
vinhowe May 24, 2023
95da00f
Clean up create atp experiment imports
vinhowe May 24, 2023
562fb9d
Do formatting for create atp experiment
vinhowe May 24, 2023
28e3ac9
Update check survey prompts for experiments
vinhowe May 24, 2023
4e79da7
Update estimate survey to experiment config file
vinhowe May 24, 2023
cde6e24
Fix ATP configuration script
vinhowe May 24, 2023
9076e10
Use ordinals to find invalid options
vinhowe May 24, 2023
9eba1c9
Fix
vinhowe May 24, 2023
f0cf51d
Fix rate limit error import
vinhowe May 25, 2023
f510c3e
Bump up rate limit to what OpenAI says we have
vinhowe May 25, 2023
9aaead3
Push updates variables
vinhowe May 26, 2023
ac8e3fb
Add force flag.
alexgshaw May 26, 2023
6e9d77f
Sampler fix.
alexgshaw May 26, 2023
0a24acd
Added logging, added functionality to fill in missing response_objects
chrisrytting May 26, 2023
ad4525d
Merge branch 'emnlp' of github.com:BYU-PCCL/lm-survey into emnlp
chrisrytting May 26, 2023
b058a5f
Bug fix.
alexgshaw May 26, 2023
34115df
Merge branch 'emnlp' of github.com:BYU-PCCL/lm-survey into emnlp
chrisrytting May 27, 2023
5b3c884
added some helpers
chrisrytting May 29, 2023
07fdbbe
Added tests for infilling
chrisrytting May 29, 2023
21b8e75
Added infilling ability, some more logging, and some extra DVS functi…
chrisrytting May 29, 2023
446672c
Removed data push mistake
chrisrytting May 29, 2023
835da77
Merge branch 'emnlp' of github.com:BYU-PCCL/lm-survey into emnlp
vinhowe May 29, 2023
1aff991
Remove unused import in async sampler
vinhowe May 29, 2023
8f7e6ad
Add updated estimate_survey.py
vinhowe May 30, 2023
849e91e
Minor fixes.
alexgshaw May 31, 2023
91b8041
Get rid of horrible rate limit print
vinhowe May 31, 2023
e1b1c52
Merge branch 'emnlp' of github.com:BYU-PCCL/lm-survey into emnlp
vinhowe May 31, 2023
f65c797
Made the rep calculation neater
chrisrytting Jun 2, 2023
c96a889
Merge branch 'emnlp' of github.com:BYU-PCCL/lm-survey into emnlp
chrisrytting Jun 2, 2023
0909387
Merge branch 'emnlp' of github.com:BYU-PCCL/lm-survey into emnlp
vinhowe Jun 2, 2023
df555c6
Added weighting and an ability to extract D_H from opinionqa dataset
chrisrytting Jun 3, 2023
a0ba33e
Merge branch 'emnlp' of github.com:BYU-PCCL/lm-survey into emnlp
chrisrytting Jun 3, 2023
7 changes: 5 additions & 2 deletions .gitignore
@@ -129,10 +129,13 @@ dmypy.json
# Pyre type checker
.pyre/

data
docker
results
nb_*
.Trash-0
*.sh
.DS_Store
experiments
*.tar.gz
*.zip
56 changes: 56 additions & 0 deletions calculate_representativeness.py
@@ -0,0 +1,56 @@
from lm_survey.survey.results import SurveyResults
import numpy as np
import os
import json
from lm_survey.survey.dependent_variable_sample import DependentVariableSample
import glob
import pandas as pd
import sys


# Grab all the files called "results.json" in the "experiments" directory
input_filepaths = glob.glob(
    os.path.join("experiments/breadth", "**", "results.json"), recursive=True
)

print(input_filepaths)

# read input filepaths into pandas dfs
dfs = []
mean_reps = {}
for input_filepath in input_filepaths:
    with open(input_filepath, "r") as file:
        results = json.load(file)
    question_samples = [
        DependentVariableSample(
            **sample_dict,
        )
        for sample_dict in results
    ]

    survey_results = SurveyResults(question_samples=question_samples)
    wave = input_filepath.split("/")[3][-3:]
    # mean_reps[wave] = survey_results.get_representativeness()
    print(f"{wave}: {survey_results.calculate_avg_samples()}")

# print("Average representativeness: ", np.mean(list(mean_reps.values())))
# print(
#     "Average representativeness per : \n",
#     [f"{k}: {v}\n" for k, v in mean_reps.items()],
# )


# with open(input_filepath, "r") as file:
#     results = json.load(file)

# question_samples = [
#     DependentVariableSample(
#         **sample_dict,
#     )
#     for sample_dict in results["llama-7b-hf"]
# ]

# survey_results = SurveyResults(question_samples=question_samples)

# # Print with 2 decimal places
# print(survey_results.get_mean_score(slice_by=["gender"]).round(2))
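The wave label in the loop above is recovered purely from the path layout produced by the glob. A minimal sketch with a hypothetical path (the `experiments/breadth/<model>/<wave>/results.json` layout is assumed, not stated in the script):

```python
# Hypothetical path matching the experiments/breadth glob pattern above.
input_filepath = "experiments/breadth/llama-7b-hf/ATP_W92/results.json"

# split("/")[3] picks the wave directory name; [-3:] keeps the trailing wave id.
wave = input_filepath.split("/")[3][-3:]
print(wave)  # W92
```

Note this indexing breaks if the directory nesting changes, which is why the layout assumption matters.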
70 changes: 70 additions & 0 deletions check_survey_prompts.py
@@ -0,0 +1,70 @@
import argparse
import json
import os
import typing
from pathlib import Path

from tqdm import tqdm

from lm_survey.survey import Survey


def check_survey_prompts(
    survey_name: str,
    experiment_name: str,
):
    data_dir = os.path.join("data", survey_name)
    variables_dir = os.path.join("variables", survey_name)
    experiment_dir = os.path.join("experiments", experiment_name, survey_name)

    with open(os.path.join(experiment_dir, "config.json"), "r") as file:
        config = json.load(file)

    print(os.path.join(variables_dir, "variables.json"))

    survey = Survey(
        name=survey_name,
        data_filename=os.path.join(data_dir, "data.csv"),
        variables_filename=os.path.join(variables_dir, "variables.json"),
        independent_variable_names=config["independent_variable_names"],
        dependent_variable_names=config["dependent_variable_names"],
    )

    next_survey_sample = next(survey.iterate())
    print(f"## EXAMPLE PROMPT FOR {data_dir}:")
    print()
    print('"""')
    print(
        f"{next_survey_sample.prompt}█{next_survey_sample.completion.correct_completion}"
    )
    print('"""')
    print()
    print(f"## DEMOGRAPHICS NATURAL LANGUAGE SUMMARY FOR {data_dir}:")
    print()
    survey.print_demographics_natural_language_summary()


def main(survey_directories: typing.List[Path], experiment_name: str) -> None:
    for survey_directory in survey_directories:
        check_survey_prompts(survey_directory, experiment_name)


if __name__ == "__main__":
    parser = argparse.ArgumentParser()

    # Positional argument for survey dir(s)
    parser.add_argument(
        "survey_directory",
        nargs="+",
        type=Path,
    )
    parser.add_argument(
        "-e",
        "--experiment_name",
        type=str,
        default="default",
    )

    args = parser.parse_args()

    main(survey_directories=args.survey_directory, experiment_name=args.experiment_name)
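The CLI above takes one or more survey directories positionally plus an optional experiment name. A small sketch of how `argparse` handles that shape, parsing a hypothetical invocation instead of `sys.argv` (the survey names here are made up):

```python
import argparse
from pathlib import Path

parser = argparse.ArgumentParser()
# nargs="+" collects one or more positional values into a list of Paths.
parser.add_argument("survey_directory", nargs="+", type=Path)
parser.add_argument("-e", "--experiment_name", type=str, default="default")

# Hypothetical invocation: two survey dirs and an explicit experiment name.
args = parser.parse_args(["ATP/W92", "ATP/W93", "-e", "breadth"])
print([str(p) for p in args.survey_directory])  # ['ATP/W92', 'ATP/W93']
print(args.experiment_name)  # breadth
```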
39 changes: 39 additions & 0 deletions configure_atp.py
@@ -0,0 +1,39 @@
import argparse
import json
import os
from pathlib import Path

from lm_survey.survey.survey import Survey

if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    parser.add_argument(
        "wave", type=Path, nargs="+", help="Path(s) to wave of ATP to configure"
    )
    parser.add_argument("output_path", type=Path, help="Path to output directory")
    parser.add_argument(
        "--base-variables", type=Path, help="Path to optional base variables"
    )
    args = parser.parse_args()

    base_variables = None
    if args.base_variables:
        with args.base_variables.open("r") as f:
            base_variables = json.load(f)

    for wave in args.wave:
        survey = Survey(name="ATP_W92", data_filename=wave / "data.csv")

        wave_output_dir = args.output_path / wave
        wave_output_dir.mkdir(parents=True, exist_ok=True)

        output_variables_path = wave_output_dir / "variables.json"
        survey.generate_atp_variables(wave, output_variables_path)

        # This is a simple way to put some extra stuff in the variables file
        if base_variables:
            with output_variables_path.open("r") as f:
                variables = json.load(f)
            variables.extend(base_variables)
            with output_variables_path.open("w") as f:
                json.dump(variables, f, indent=2)
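The `--base-variables` step above simply appends shared entries to each wave's generated variable list before rewriting the JSON file. A minimal sketch of that merge, with hypothetical variable entries standing in for the real generated ones:

```python
import json

variables = [{"name": "AGE"}]           # stand-in for generated per-wave variables
base_variables = [{"name": "CREGION"}]  # stand-in for shared base variables

# Same merge as the script: base entries are appended after the generated ones.
variables.extend(base_variables)
print(json.dumps([v["name"] for v in variables]))  # ["AGE", "CREGION"]
```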
56 changes: 56 additions & 0 deletions create_atp_experiment.py
@@ -0,0 +1,56 @@
import argparse
import json
from pathlib import Path

import pandas as pd
from tqdm import tqdm


def main(survey_name: str, experiment_name: str) -> None:
    data_dir = Path("data") / survey_name
    experiment_dir = Path("experiments") / experiment_name / survey_name

    # create experiment dir
    if not experiment_dir.exists():
        experiment_dir.mkdir(parents=True, exist_ok=True)

    info_csv_path = data_dir / "info.csv"
    metadata_csv_path = data_dir / "metadata.csv"

    info_df = pd.read_csv(info_csv_path)
    metadata_df = pd.read_csv(metadata_csv_path)

    experiment_config = {
        "independent_variable_names": list(metadata_df["key"]),
        "dependent_variable_names": list(info_df["key"]),
    }

    with (experiment_dir / "config.json").open("w") as file:
        json.dump(experiment_config, file, indent=4)


if __name__ == "__main__":
    parser = argparse.ArgumentParser()

    parser.add_argument(
        "-s",
        "--survey_name",
        type=str,
        default="all",
    )
    parser.add_argument(
        "-e",
        "--experiment_name",
        type=str,
        default="default",
    )

    args = parser.parse_args()

    if args.survey_name == "all":
        paths = sorted(Path("data").glob("ATP/American*/"))
        for path in tqdm(paths):
            args.survey_name = str(path.relative_to("data"))
            main(survey_name=args.survey_name, experiment_name=args.experiment_name)
    else:
        main(survey_name=args.survey_name, experiment_name=args.experiment_name)
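The experiment config above is built by reading a `key` column out of each CSV: `metadata.csv` supplies independent variables and `info.csv` supplies dependent ones. A sketch with hypothetical in-memory stand-ins for the two CSVs (the variable names are invented):

```python
from io import StringIO

import json
import pandas as pd

# Hypothetical stand-ins for metadata.csv / info.csv, each with a "key" column.
metadata_df = pd.read_csv(StringIO("key\nAGE\nCREGION\n"))
info_df = pd.read_csv(StringIO("key\nSATIS\nECON\n"))

experiment_config = {
    "independent_variable_names": list(metadata_df["key"]),
    "dependent_variable_names": list(info_df["key"]),
}
print(json.dumps(experiment_config, indent=4))
```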
144 changes: 144 additions & 0 deletions estimate_survey.py
@@ -0,0 +1,144 @@
import argparse
import json
import os
import typing

import numpy as np
import pandas as pd
from tqdm import tqdm

from lm_survey.samplers import AutoSampler, BaseSampler
from lm_survey.survey import Survey


def estimate_survey_costs(
    sampler: BaseSampler,
    survey_name: str,
    experiment_name: str,
    *,
    n_samples_per_dependent_variable: typing.Optional[int] = None,
    n_top_mutual_info_dvs: typing.Optional[int] = None,
):
    data_dir = os.path.join("data", survey_name)
    variables_dir = os.path.join("variables", survey_name)
    experiment_dir = os.path.join("experiments", experiment_name, survey_name)

    with open(os.path.join(experiment_dir, "config.json"), "r") as file:
        config = json.load(file)

    survey = Survey(
        name=survey_name,
        data_filename=os.path.join(data_dir, "data.csv"),
        variables_filename=os.path.join(variables_dir, "variables.json"),
        independent_variable_names=config["independent_variable_names"],
        dependent_variable_names=config["dependent_variable_names"],
    )

    dependent_variable_samples = list(
        survey.iterate(
            n_samples_per_dependent_variable=n_samples_per_dependent_variable
        )
    )

    prompt_count = len(dependent_variable_samples)

    if hasattr(sampler, "batch_estimate_prompt_cost"):
        completion_costs = sampler.batch_estimate_prompt_cost(
            [
                dependent_variable_sample.prompt
                for dependent_variable_sample in dependent_variable_samples
            ]
        )
    else:
        completion_costs = []
        for dependent_variable_sample in tqdm(dependent_variable_samples):
            completion_cost = sampler.estimate_prompt_cost(
                dependent_variable_sample.prompt
            )
            completion_costs.append(completion_cost)

    total_completion_cost = np.sum(completion_costs)

    return {
        "prompt_count": prompt_count,
        "cost": total_completion_cost,
    }


def main(
    model_name: str,
    survey_names: typing.List[str],
    experiment_name: str,
    n_samples_per_dependent_variable: typing.Optional[int] = None,
    n_top_mutual_info_dvs: typing.Optional[int] = None,
) -> None:
    sampler = AutoSampler(model_name=model_name)

    survey_costs = {}
    for survey_name in tqdm(survey_names):
        estimate = estimate_survey_costs(
            sampler=sampler,
            survey_name=survey_name,
            experiment_name=experiment_name,
            n_samples_per_dependent_variable=n_samples_per_dependent_variable,
            n_top_mutual_info_dvs=n_top_mutual_info_dvs,
        )
        survey_costs[survey_name] = estimate

    total_cost = sum([estimate["cost"] for estimate in survey_costs.values()])

    total_prompt_count = sum(
        [estimate["prompt_count"] for estimate in survey_costs.values()]
    )

    if len(survey_names) > 1:
        print("Cost per survey:")
        for survey_name, survey_cost in survey_costs.items():
            print(
                f"{survey_name}: ${(survey_cost['cost'] / 100):.2f} ({survey_cost['prompt_count']}"
                " prompts)"
            )

    print(f"Total cost: ${(total_cost / 100):.2f} ({total_prompt_count} prompts)")


if __name__ == "__main__":
    parser = argparse.ArgumentParser()

    parser.add_argument(
        "-m",
        "--model_name",
        type=str,
        required=True,
    )
    parser.add_argument(
        "-n",
        "--n_samples_per_dependent_variable",
        type=int,
    )
    parser.add_argument(
        "--n_top_mutual_info_dvs",
        type=int,
    )
    parser.add_argument(
        "-e",
        "--experiment_name",
        type=str,
        default="default",
    )
    # Positional argument for survey dir(s)
    parser.add_argument(
        "survey_name",
        nargs="+",
        type=str,
    )

    args = parser.parse_args()

    main(
        model_name=args.model_name,
        survey_names=args.survey_name,
        experiment_name=args.experiment_name,
        n_samples_per_dependent_variable=args.n_samples_per_dependent_variable,
        n_top_mutual_info_dvs=args.n_top_mutual_info_dvs,
    )
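The cost roll-up in `main` above sums per-survey estimates and formats dollars with `cost / 100`, which implies costs are tracked in cents. A minimal sketch of just that aggregation, with hypothetical per-survey figures:

```python
# Hypothetical per-survey estimates; costs are assumed to be in cents,
# matching the `cost / 100` dollar formatting in the script above.
survey_costs = {
    "ATP/W92": {"cost": 1234, "prompt_count": 100},
    "ATP/W93": {"cost": 266, "prompt_count": 20},
}

total_cost = sum(estimate["cost"] for estimate in survey_costs.values())
total_prompt_count = sum(
    estimate["prompt_count"] for estimate in survey_costs.values()
)

print(f"Total cost: ${(total_cost / 100):.2f} ({total_prompt_count} prompts)")
# Total cost: $15.00 (120 prompts)
```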