Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Dataset generation and filtering, and import from other datasets #21

Draft
wants to merge 11 commits into
base: main
Choose a base branch
from
248 changes: 167 additions & 81 deletions bingus-python-encoder/data_utils.py
Original file line number Diff line number Diff line change
@@ -1,30 +1,20 @@
import math
import os
from typing import TypeAlias
from pydantic import BaseModel
from datasets import Dataset
from typo import StrErrer
from random import Random

RandomSeed: TypeAlias = int | float | str | bytes | bytearray | None

class FaqEntry(BaseModel):
    """A single FAQ: a title, its canonical answer, and the questions it matches."""
    # Display title of the FAQ entry.
    title: str
    # Canonical answer text returned for any matched question.
    answer: str
    # Question phrasings that should map to this answer.
    matched_questions: list[str]


class FaqConfig(BaseModel):
    """Root FAQ configuration model: the full list of FAQ entries."""
    faqs: list[FaqEntry]


def load_faq_config(paths: list[str]) -> FaqConfig:
    """
    Searches through a list of paths to find and load the first existing faq_config.json file.
    Raises a FileNotFoundError if none of the paths exist.
    """
    for candidate in paths:
        if not os.path.isfile(candidate):
            continue
        print(f"Found \"faq_config.json\" at \"{candidate}\"!")
        with open(candidate, "r") as f:
            return FaqConfig.model_validate_json(f.read())
    raise FileNotFoundError(
        "Could not find \"faq_config.json\" in any of the default paths.")
def split_dataset(dataset: Dataset, eval_percent: float | int) -> tuple[Dataset, Dataset | None]:
    """Splits the dataset into training and evaluation sets based on the evaluation percentage."""
    if eval_percent <= 0:
        # No evaluation split requested: everything stays in training.
        return dataset, None
    parts = dataset.train_test_split(test_size=eval_percent)
    return parts["train"], parts["test"]


def generate_entry_pairs(entries: list[list[str]]) -> Dataset:
Expand Down Expand Up @@ -56,69 +46,165 @@ def generate_entry_pairs(entries: list[list[str]]) -> Dataset:
})


def generate_question_pairs(faqs: list[FaqEntry]) -> Dataset:
    """
    Generates question-to-question pairs from the FAQs, where each question is
    paired with all other questions in its set (positive samples) and with
    questions from other sets (negative samples).
    """
    question_sets = []
    for faq in faqs:
        question_sets.append(faq.matched_questions)
    return generate_entry_pairs(question_sets)
def random_typo(str_err: StrErrer, random: Random) -> StrErrer:
    """Applies one randomly chosen typo operation to the wrapped string."""
    # One uniform draw in [0, 7] selects among the eight supported typo
    # operations; index 7 (unichar) matches the original else-fallback.
    operations = (
        str_err.char_swap,
        str_err.missing_char,
        str_err.extra_char,
        str_err.nearby_char,
        str_err.skipped_space,
        str_err.random_space,
        str_err.repeated_char,
        str_err.unichar,
    )
    return operations[random.randint(0, 7)]()


def generate_question_answer_pairs(faqs: list[FaqEntry], include_title: bool = True) -> Dataset:
    """
    Generates question-answer pairs from the FAQs, where each question is paired with its correct
    answer (positive sample) and other incorrect answers (negative samples).
    """
    questions, answers, scores = [], [], []

    # Precompute all answers for negative samples
    all_answers = [faq.answer for faq in faqs]

    def add_samples(text: str, correct: str) -> None:
        # One positive pair, then one negative pair per differing answer.
        questions.append(text)
        answers.append(correct)
        scores.append(1.0)
        for candidate in all_answers:
            if candidate != correct:
                questions.append(text)
                answers.append(candidate)
                scores.append(0.0)

    for faq in faqs:
        for question in faq.matched_questions:
            add_samples(question, faq.answer)
        if include_title:
            # The title is treated as just another question phrasing.
            add_samples(faq.title, faq.answer)

    return Dataset.from_dict({
        "sentence1": questions,
        "sentence2": answers,
        "score": scores,
    })
class FaqEntry(BaseModel):
    """A single FAQ: an optional title, its answer, and the questions it matches."""
    # Optional display title; None when the entry has no title.
    title: str | None
    # Canonical answer text returned for any matched question.
    answer: str
    # Question phrasings that should map to this answer.
    matched_questions: list[str]


def generate_everything_pairs(faqs: list[FaqEntry]) -> Dataset:
    """
    Generates pairs of titles, answers, and questions from the FAQs, where each set is paired
    with its correct answer (positive sample) and other incorrect answers (negative samples).
    """
    entry_sets = []
    for faq in faqs:
        entry_sets.append([faq.title, faq.answer, *faq.matched_questions])
    return generate_entry_pairs(entry_sets)
class FaqConfig(BaseModel):
    """Top-level FAQ configuration: holds every FAQ entry and dataset helpers."""
    # All FAQ entries loaded from faq_config.json.
    faqs: list[FaqEntry]

@staticmethod
def load_from_file(paths: list[str] | str) -> "FaqConfig":
    """
    Searches through a list of paths to find and load the first existing
    faq_config.json file. A single path string is also accepted.
    Raises a FileNotFoundError if none of the paths exist.
    """
    # Bug fix: a bare string would otherwise be iterated character by
    # character, so none of the "paths" would ever be an existing file.
    if isinstance(paths, str):
        paths = [paths]
    for path in paths:
        if os.path.isfile(path):
            print(f"Found \"faq_config.json\" at \"{path}\"!")
            with open(path, "r") as f:
                return FaqConfig.model_validate_json(f.read())
    raise FileNotFoundError(
        "Could not find \"faq_config.json\" in any of the default paths.")

def save_to_file(self, path: str):
    """
    Saves a faq_config.json file to the specified path.
    """
    serialized = self.model_dump_json()
    with open(path, "w") as f:
        f.write(serialized)

def iterate_answers(self):
    """Yields the answer text of each FAQ entry, in order."""
    yield from (faq.answer for faq in self.faqs)

def question_count(self) -> int:
    """Returns the total number of matched questions across all FAQ entries."""
    return sum(len(faq.matched_questions) for faq in self.faqs)

def filter_short_questions(self, min_words: int):
    """
    Filters out questions shorter than min_words and removes entries that
    are left with no questions at all.
    """
    remaining_faqs = []
    for faq in self.faqs:
        kept = [q for q in faq.matched_questions if len(q.split()) >= min_words]
        faq.matched_questions = kept
        if kept:
            remaining_faqs.append(faq)
    self.faqs = remaining_faqs

def generate_typos(
    self,
    entry_variants: int,
    min_typos: int,
    max_typos: int,
    scale_max_per_word: bool = True,
    scale_min_per_word: bool = False,
    per_word_multiplier: float = 1.0,
    seed: RandomSeed = None
) -> tuple[int, int]:
    """
    Generates typo-laden variants of each question of each entry and returns
    the number of questions added and the number of typos generated.

    Each question spawns entry_variants new questions, each with a random
    number of typos in [min_typos, max_typos]; the bounds can optionally be
    scaled by the question's word count times per_word_multiplier.

    Raises ValueError when the variant count or typo bounds are inconsistent.
    """
    if entry_variants < 1:
        raise ValueError(
            "entry_variants must be greater than or equal to 1")
    if min_typos < 0:
        raise ValueError("min_typos must be greater than or equal to 0")
    if max_typos < 1:
        raise ValueError("max_typos must be greater than or equal to 1")
    if min_typos > max_typos:
        raise ValueError(
            "min_typos must be less than or equal to max_typos")

    seeded_random = Random(seed)
    typo_entry_count = 0
    typo_count = 0
    for faq in self.faqs:
        new_qs: list[str] = []

        for question in faq.matched_questions:
            # Word-count scale factor, floored at 1 so very short questions
            # still get at least the configured typo range.
            word_scale = max(1, len(question.split()) * per_word_multiplier)
            q_min_typos = min_typos
            q_max_typos = max_typos
            if scale_max_per_word:
                q_max_typos *= word_scale
            # Bug fix: min-scaling was previously nested inside the
            # scale_max_per_word branch, so it was silently ignored
            # whenever scale_max_per_word was False.
            if scale_min_per_word:
                q_min_typos *= word_scale
                # Keep the range valid if only the minimum was scaled.
                q_min_typos = min(q_min_typos, q_max_typos)

            for _ in range(entry_variants):
                num_typos = seeded_random.randint(
                    math.ceil(q_min_typos), math.ceil(q_max_typos))
                typo_q = StrErrer(question, seed=seeded_random.random())
                for _ in range(num_typos):
                    typo_q = random_typo(typo_q, seeded_random)
                new_qs.append(typo_q.result)
                typo_count += num_typos

        faq.matched_questions.extend(new_qs)
        typo_entry_count += len(new_qs)

    return typo_entry_count, typo_count

def generate_question_pairs(self) -> Dataset:
    """
    Generates question-to-question pairs from the FAQs, where each question is
    paired with all other questions in its set (positive samples) and with
    questions from other sets (negative samples).
    """
    question_sets = []
    for faq in self.faqs:
        question_sets.append(faq.matched_questions)
    return generate_entry_pairs(question_sets)

def generate_question_answer_pairs(self) -> Dataset:
    """
    Generates question-answer pairs from the FAQs, where each question is
    paired with its correct answer (positive sample) and other incorrect
    answers (negative samples).
    """
    # Fix: a stale duplicate of the module-level split_dataset function was
    # interleaved in the middle of this method (diff residue), splitting the
    # loop in two; it is removed here — split_dataset is already defined at
    # module level.
    questions, answers, scores = [], [], []

    for faq in self.faqs:
        for question in faq.matched_questions:
            # Positive sample (correct answer)
            questions.append(question)
            answers.append(faq.answer)
            scores.append(1.0)

            # Negative samples (incorrect answers)
            for other_answer in self.iterate_answers():
                if other_answer != faq.answer:
                    questions.append(question)
                    answers.append(other_answer)
                    scores.append(0.0)

    return Dataset.from_dict({
        "sentence1": questions,
        "sentence2": answers,
        "score": scores,
    })

def generate_everything_pairs(self) -> Dataset:
    """
    Generates pairs of titles, answers, and questions from the FAQs, where each set is paired
    with its correct answer (positive sample) and other incorrect answers (negative samples).

    Entries without a title (FaqEntry.title is None) simply omit it instead of
    injecting None into the dataset.
    """
    entry_sets = []
    for faq in self.faqs:
        texts = [faq.answer, *faq.matched_questions]
        # title is optional (str | None); only pair it when present.
        if faq.title is not None:
            texts.insert(0, faq.title)
        entry_sets.append(texts)
    return generate_entry_pairs(entry_sets)
76 changes: 54 additions & 22 deletions bingus-python-encoder/fine-tune.py
Original file line number Diff line number Diff line change
@@ -1,20 +1,27 @@
from data_utils import load_faq_config, generate_question_pairs, generate_question_answer_pairs, generate_everything_pairs, split_dataset
from data_utils import FaqConfig, split_dataset
from sentence_transformers import SentenceTransformer, SentenceTransformerTrainer, SentenceTransformerTrainingArguments
from sentence_transformers.losses import CoSENTLoss
from sentence_transformers.losses import AnglELoss
from sentence_transformers.evaluation import EmbeddingSimilarityEvaluator, SimilarityFunction
import math
import os

# Load FAQ configuration
faqs = load_faq_config([
faq_config = FaqConfig.load_from_file([
"./faq_config.json",
"../BingusApi/config/faq_config.json",
"./BingusApi/config/faq_config.json"
]).faqs
])
print(
f"Loaded FAQ config:\n > {len(faq_config.faqs)} FAQs\n > {faq_config.question_count()} questions")

# FAQ modifiers
filter_short_questions = True
generate_faq_typos = True
save_modified_faq = True

# Data pairing mode
# 0. Question to question (q2q)
# 1. Question to answer (q2a)
# 2. Everything to everything (e2e)
# 0: Question to question (q2q)
# 1: Question to answer (q2a)
# 2: Everything to everything (e2e)
pairing_modes = ["q2q", "q2a", "e2e"]
pairing_mode = 1
pairing_mode_name = pairing_modes[pairing_mode]
Expand All @@ -29,38 +36,63 @@
base_model = "all-MiniLM-L6-v2"

# Output model settings
model_ver = 3
model_ver = 4
model_name = f"Bingus-{pairing_mode_name}-v{model_ver}{eval_name}_{base_model}"
model_dir = f"./local-models/{model_name}/"
output_path = f"{model_dir}{model_name}/"
checkpoint_path = f"{model_dir}checkpoints/"
os.makedirs(model_dir, exist_ok=True)

# Modify FAQ config
if filter_short_questions:
print("Filtering short questions...")
faq_config.filter_short_questions(4)
print(
f"Filtered FAQ config:\n > {len(faq_config.faqs)} FAQs\n > {faq_config.question_count()} questions")

if generate_faq_typos:
print("Generating typos...")
typo_entry_count, typo_count = faq_config.generate_typos(
entry_variants=3,
min_typos=1,
max_typos=2,
scale_max_per_word=True,
scale_min_per_word=True,
per_word_multiplier=0.2,
seed=42
)
print(
f"Generated {typo_entry_count} new questions with {typo_count} typos.")

if save_modified_faq:
faq_output = f"{model_dir}faq_config.json"
faq_config.save_to_file(faq_output)
print(f"Saved modified FAQ to \"{faq_output}\".")

# Generate dataset and split if in eval mode
print("Generating datasets...")
if (pairing_mode == 0):
dataset = generate_question_pairs(faqs)
dataset = faq_config.generate_question_pairs()
elif (pairing_mode == 1):
dataset = generate_question_answer_pairs(faqs)
dataset = faq_config.generate_question_answer_pairs()
elif (pairing_mode == 2):
dataset = generate_everything_pairs(faqs)
dataset = faq_config.generate_everything_pairs()
else:
raise ValueError(f"Invalid pairing mode: {pairing_mode}")
train_data, eval_data = split_dataset(dataset, eval_percent)

print(
f"Generated datasets: \n > Train: {train_data.num_rows} entries\n > Eval: {0 if eval_data is None else eval_data.num_rows} entries")
f"Generated datasets:\n > Train: {train_data.num_rows} entries\n > Eval: {0 if eval_data is None else eval_data.num_rows} entries")

# Load the model
print("Loading model to fine-tune...")
model = SentenceTransformer(base_model, cache_folder=model_cache)

# Set training arguments
args = SentenceTransformerTrainingArguments(
output_dir=checkpoint_path,
num_train_epochs=20,
per_device_train_batch_size=128,
per_device_eval_batch_size=128,
learning_rate=0.00005 * math.sqrt(128 / 16),
output_dir=f"{model_dir}checkpoints/",
num_train_epochs=4,
per_device_train_batch_size=64,
per_device_eval_batch_size=64,
learning_rate=0.00005,
warmup_ratio=0.1,
fp16=True,
bf16=False,
Expand Down Expand Up @@ -97,9 +129,9 @@
args=args,
train_dataset=train_data,
eval_dataset=eval_data,
loss=CoSENTLoss(model),
loss=AnglELoss(model),
evaluator=dev_evaluator,
)

trainer.train(resume_from_checkpoint=False)
model.save_pretrained(output_path)
model.save_pretrained(f"{model_dir}{model_name}/")
1 change: 1 addition & 0 deletions bingus-python-encoder/requirements/fine-tuning.txt
Original file line number Diff line number Diff line change
@@ -1,2 +1,3 @@
sentence-transformers[train] >= 3.1.1
gpl >= 0.1.4
typo >= 0.1.7
Loading