Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Dataset generation and filtering, and import from other datasets #21

Draft
wants to merge 11 commits into
base: main
Choose a base branch
from
248 changes: 167 additions & 81 deletions bingus-python-encoder/data_utils.py
Original file line number Diff line number Diff line change
@@ -1,30 +1,20 @@
import math
import os
from typing import TypeAlias
from pydantic import BaseModel
from datasets import Dataset
from typo import StrErrer
from random import Random

RandomSeed: TypeAlias = int | float | str | bytes | bytearray | None

class FaqEntry(BaseModel):
    """A single FAQ: a title, its canonical answer, and the questions it matches."""
    # Display title of the FAQ entry.
    title: str
    # Canonical answer text returned for any matched question.
    answer: str
    # Question phrasings that should map to this answer.
    matched_questions: list[str]


class FaqConfig(BaseModel):
    """Root FAQ configuration model: the full list of FAQ entries."""
    faqs: list[FaqEntry]


def load_faq_config(paths: list[str]) -> FaqConfig:
    """
    Searches through a list of paths to find and load the first existing faq_config.json file.
    Raises a FileNotFoundError if none of the paths exist.
    """
    for candidate in paths:
        if not os.path.isfile(candidate):
            continue
        print(f"Found \"faq_config.json\" at \"{candidate}\"!")
        with open(candidate, "r") as f:
            return FaqConfig.model_validate_json(f.read())
    raise FileNotFoundError(
        "Could not find \"faq_config.json\" in any of the default paths.")
def split_dataset(dataset: Dataset, eval_percent: float | int) -> tuple[Dataset, Dataset | None]:
    """Splits the dataset into training and evaluation sets based on the evaluation percentage."""
    if eval_percent <= 0:
        # No evaluation split requested: everything stays in training.
        return dataset, None
    parts = dataset.train_test_split(test_size=eval_percent)
    return parts["train"], parts["test"]


def generate_entry_pairs(entries: list[list[str]]) -> Dataset:
Expand Down Expand Up @@ -56,69 +46,165 @@ def generate_entry_pairs(entries: list[list[str]]) -> Dataset:
})


def generate_question_pairs(faqs: list[FaqEntry]) -> Dataset:
    """
    Generates question-to-question pairs from the FAQs, where each question is
    paired with all other questions in its set (positive samples) and with
    questions from other sets (negative samples).
    """
    question_sets = []
    for faq in faqs:
        question_sets.append(faq.matched_questions)
    return generate_entry_pairs(question_sets)
def random_typo(str_err: StrErrer, random: Random) -> StrErrer:
    """Applies one randomly chosen typo operation to the wrapped string."""
    # One uniform draw in [0, 7] selects among the eight supported typo
    # operations; index 7 (unichar) matches the original else-fallback.
    operations = (
        str_err.char_swap,
        str_err.missing_char,
        str_err.extra_char,
        str_err.nearby_char,
        str_err.skipped_space,
        str_err.random_space,
        str_err.repeated_char,
        str_err.unichar,
    )
    return operations[random.randint(0, 7)]()


def generate_question_answer_pairs(faqs: list[FaqEntry], include_title: bool = True) -> Dataset:
    """
    Generates question-answer pairs from the FAQs, where each question is paired with its correct
    answer (positive sample) and other incorrect answers (negative samples).
    """
    questions, answers, scores = [], [], []

    # Precompute all answers for negative samples
    all_answers = [faq.answer for faq in faqs]

    def add_samples(text: str, correct: str) -> None:
        # One positive pair, then one negative pair per differing answer.
        questions.append(text)
        answers.append(correct)
        scores.append(1.0)
        for candidate in all_answers:
            if candidate != correct:
                questions.append(text)
                answers.append(candidate)
                scores.append(0.0)

    for faq in faqs:
        for question in faq.matched_questions:
            add_samples(question, faq.answer)
        if include_title:
            # The title is treated as just another question phrasing.
            add_samples(faq.title, faq.answer)

    return Dataset.from_dict({
        "sentence1": questions,
        "sentence2": answers,
        "score": scores,
    })
class FaqEntry(BaseModel):
    """A single FAQ: an optional title, its answer, and the questions it matches."""
    # Optional display title; None when the entry has no title.
    title: str | None
    # Canonical answer text returned for any matched question.
    answer: str
    # Question phrasings that should map to this answer.
    matched_questions: list[str]


def generate_everything_pairs(faqs: list[FaqEntry]) -> Dataset:
    """
    Generates pairs of titles, answers, and questions from the FAQs, where each set is paired
    with its correct answer (positive sample) and other incorrect answers (negative samples).
    """
    entry_sets = []
    for faq in faqs:
        entry_sets.append([faq.title, faq.answer, *faq.matched_questions])
    return generate_entry_pairs(entry_sets)
class FaqConfig(BaseModel):
    """Top-level FAQ configuration: holds every FAQ entry and dataset helpers."""
    # All FAQ entries loaded from faq_config.json.
    faqs: list[FaqEntry]

@staticmethod
def load_from_file(paths: list[str] | str) -> "FaqConfig":
    """
    Searches through a list of paths to find and load the first existing
    faq_config.json file. A single path string is also accepted.
    Raises a FileNotFoundError if none of the paths exist.
    """
    # Bug fix: a bare string would otherwise be iterated character by
    # character, so none of the "paths" would ever be an existing file.
    if isinstance(paths, str):
        paths = [paths]
    for path in paths:
        if os.path.isfile(path):
            print(f"Found \"faq_config.json\" at \"{path}\"!")
            with open(path, "r") as f:
                return FaqConfig.model_validate_json(f.read())
    raise FileNotFoundError(
        "Could not find \"faq_config.json\" in any of the default paths.")

def save_to_file(self, path: str):
    """
    Saves a faq_config.json file to the specified path.
    """
    serialized = self.model_dump_json()
    with open(path, "w") as f:
        f.write(serialized)

def iterate_answers(self):
    """Yields the answer text of each FAQ entry, in order."""
    yield from (faq.answer for faq in self.faqs)

def question_count(self) -> int:
    """Returns the total number of matched questions across all FAQ entries."""
    return sum(len(faq.matched_questions) for faq in self.faqs)

def filter_short_questions(self, min_words: int):
    """
    Filters out questions shorter than min_words and removes entries that
    are left with no questions at all.
    """
    remaining_faqs = []
    for faq in self.faqs:
        kept = [q for q in faq.matched_questions if len(q.split()) >= min_words]
        faq.matched_questions = kept
        if kept:
            remaining_faqs.append(faq)
    self.faqs = remaining_faqs

def generate_typos(
    self,
    entry_variants: int,
    min_typos: int,
    max_typos: int,
    scale_max_per_word: bool = True,
    scale_min_per_word: bool = False,
    per_word_multiplier: float = 1.0,
    seed: RandomSeed = None
) -> tuple[int, int]:
    """
    Generates typo-laden variants of each question of each entry and returns
    the number of questions added and the number of typos generated.

    Each question spawns entry_variants new questions, each with a random
    number of typos in [min_typos, max_typos]; the bounds can optionally be
    scaled by the question's word count times per_word_multiplier.

    Raises ValueError when the variant count or typo bounds are inconsistent.
    """
    if entry_variants < 1:
        raise ValueError(
            "entry_variants must be greater than or equal to 1")
    if min_typos < 0:
        raise ValueError("min_typos must be greater than or equal to 0")
    if max_typos < 1:
        raise ValueError("max_typos must be greater than or equal to 1")
    if min_typos > max_typos:
        raise ValueError(
            "min_typos must be less than or equal to max_typos")

    seeded_random = Random(seed)
    typo_entry_count = 0
    typo_count = 0
    for faq in self.faqs:
        new_qs: list[str] = []

        for question in faq.matched_questions:
            # Word-count scale factor, floored at 1 so very short questions
            # still get at least the configured typo range.
            word_scale = max(1, len(question.split()) * per_word_multiplier)
            q_min_typos = min_typos
            q_max_typos = max_typos
            if scale_max_per_word:
                q_max_typos *= word_scale
            # Bug fix: min-scaling was previously nested inside the
            # scale_max_per_word branch, so it was silently ignored
            # whenever scale_max_per_word was False.
            if scale_min_per_word:
                q_min_typos *= word_scale
                # Keep the range valid if only the minimum was scaled.
                q_min_typos = min(q_min_typos, q_max_typos)

            for _ in range(entry_variants):
                num_typos = seeded_random.randint(
                    math.ceil(q_min_typos), math.ceil(q_max_typos))
                typo_q = StrErrer(question, seed=seeded_random.random())
                for _ in range(num_typos):
                    typo_q = random_typo(typo_q, seeded_random)
                new_qs.append(typo_q.result)
                typo_count += num_typos

        faq.matched_questions.extend(new_qs)
        typo_entry_count += len(new_qs)

    return typo_entry_count, typo_count

def generate_question_pairs(self) -> Dataset:
    """
    Generates question-to-question pairs from the FAQs, where each question is
    paired with all other questions in its set (positive samples) and with
    questions from other sets (negative samples).
    """
    question_sets = []
    for faq in self.faqs:
        question_sets.append(faq.matched_questions)
    return generate_entry_pairs(question_sets)

def generate_question_answer_pairs(self) -> Dataset:
    """
    Generates question-answer pairs from the FAQs, where each question is
    paired with its correct answer (positive sample) and other incorrect
    answers (negative samples).
    """
    # Fix: a stale duplicate of the module-level split_dataset function was
    # interleaved in the middle of this method (diff residue), splitting the
    # loop in two; it is removed here — split_dataset is already defined at
    # module level.
    questions, answers, scores = [], [], []

    for faq in self.faqs:
        for question in faq.matched_questions:
            # Positive sample (correct answer)
            questions.append(question)
            answers.append(faq.answer)
            scores.append(1.0)

            # Negative samples (incorrect answers)
            for other_answer in self.iterate_answers():
                if other_answer != faq.answer:
                    questions.append(question)
                    answers.append(other_answer)
                    scores.append(0.0)

    return Dataset.from_dict({
        "sentence1": questions,
        "sentence2": answers,
        "score": scores,
    })

def generate_everything_pairs(self) -> Dataset:
    """
    Generates pairs of titles, answers, and questions from the FAQs, where each set is paired
    with its correct answer (positive sample) and other incorrect answers (negative samples).

    Entries without a title (FaqEntry.title is None) simply omit it instead of
    injecting None into the dataset.
    """
    entry_sets = []
    for faq in self.faqs:
        texts = [faq.answer, *faq.matched_questions]
        # title is optional (str | None); only pair it when present.
        if faq.title is not None:
            texts.insert(0, faq.title)
        entry_sets.append(texts)
    return generate_entry_pairs(entry_sets)
76 changes: 54 additions & 22 deletions bingus-python-encoder/fine-tune.py
Original file line number Diff line number Diff line change
@@ -1,20 +1,27 @@
from data_utils import load_faq_config, generate_question_pairs, generate_question_answer_pairs, generate_everything_pairs, split_dataset
from data_utils import FaqConfig, split_dataset
from sentence_transformers import SentenceTransformer, SentenceTransformerTrainer, SentenceTransformerTrainingArguments
from sentence_transformers.losses import CoSENTLoss
from sentence_transformers.losses import AnglELoss
from sentence_transformers.evaluation import EmbeddingSimilarityEvaluator, SimilarityFunction
import math
import os

# Load FAQ configuration
faqs = load_faq_config([
faq_config = FaqConfig.load_from_file([
"./faq_config.json",
"../BingusApi/config/faq_config.json",
"./BingusApi/config/faq_config.json"
]).faqs
])
print(
f"Loaded FAQ config:\n > {len(faq_config.faqs)} FAQs\n > {faq_config.question_count()} questions")

# FAQ modifiers
filter_short_questions = True
generate_faq_typos = True
save_modified_faq = True

# Data pairing mode
# 0. Question to question (q2q)
# 1. Question to answer (q2a)
# 2. Everything to everything (e2e)
# 0: Question to question (q2q)
# 1: Question to answer (q2a)
# 2: Everything to everything (e2e)
pairing_modes = ["q2q", "q2a", "e2e"]
pairing_mode = 1
pairing_mode_name = pairing_modes[pairing_mode]
Expand All @@ -29,38 +36,63 @@
base_model = "all-MiniLM-L6-v2"

# Output model settings
model_ver = 3
model_ver = 4
model_name = f"Bingus-{pairing_mode_name}-v{model_ver}{eval_name}_{base_model}"
model_dir = f"./local-models/{model_name}/"
output_path = f"{model_dir}{model_name}/"
checkpoint_path = f"{model_dir}checkpoints/"
os.makedirs(model_dir, exist_ok=True)

# Modify FAQ config
if filter_short_questions:
print("Filtering short questions...")
faq_config.filter_short_questions(4)
print(
f"Filtered FAQ config:\n > {len(faq_config.faqs)} FAQs\n > {faq_config.question_count()} questions")

if generate_faq_typos:
print("Generating typos...")
typo_entry_count, typo_count = faq_config.generate_typos(
entry_variants=3,
min_typos=1,
max_typos=2,
scale_max_per_word=True,
scale_min_per_word=True,
per_word_multiplier=0.2,
seed=42
)
print(
f"Generated {typo_entry_count} new questions with {typo_count} typos.")

if save_modified_faq:
faq_output = f"{model_dir}faq_config.json"
faq_config.save_to_file(faq_output)
print(f"Saved modified FAQ to \"{faq_output}\".")

# Generate dataset and split if in eval mode
print("Generating datasets...")
if (pairing_mode == 0):
dataset = generate_question_pairs(faqs)
dataset = faq_config.generate_question_pairs()
elif (pairing_mode == 1):
dataset = generate_question_answer_pairs(faqs)
dataset = faq_config.generate_question_answer_pairs()
elif (pairing_mode == 2):
dataset = generate_everything_pairs(faqs)
dataset = faq_config.generate_everything_pairs()
else:
raise ValueError(f"Invalid pairing mode: {pairing_mode}")
train_data, eval_data = split_dataset(dataset, eval_percent)

print(
f"Generated datasets: \n > Train: {train_data.num_rows} entries\n > Eval: {0 if eval_data is None else eval_data.num_rows} entries")
f"Generated datasets:\n > Train: {train_data.num_rows} entries\n > Eval: {0 if eval_data is None else eval_data.num_rows} entries")

# Load the model
print("Loading model to fine-tune...")
model = SentenceTransformer(base_model, cache_folder=model_cache)

# Set training arguments
args = SentenceTransformerTrainingArguments(
output_dir=checkpoint_path,
num_train_epochs=20,
per_device_train_batch_size=128,
per_device_eval_batch_size=128,
learning_rate=0.00005 * math.sqrt(128 / 16),
output_dir=f"{model_dir}checkpoints/",
num_train_epochs=4,
per_device_train_batch_size=64,
per_device_eval_batch_size=64,
learning_rate=0.00005,
warmup_ratio=0.1,
fp16=True,
bf16=False,
Expand Down Expand Up @@ -97,9 +129,9 @@
args=args,
train_dataset=train_data,
eval_dataset=eval_data,
loss=CoSENTLoss(model),
loss=AnglELoss(model),
evaluator=dev_evaluator,
)

trainer.train(resume_from_checkpoint=False)
model.save_pretrained(output_path)
model.save_pretrained(f"{model_dir}{model_name}/")
1 change: 1 addition & 0 deletions bingus-python-encoder/requirements/fine-tuning.txt
Original file line number Diff line number Diff line change
@@ -1,2 +1,3 @@
sentence-transformers[train] >= 3.1.1
gpl >= 0.1.4
typo >= 0.1.7
Loading