From 5daf071feb66134379c5757e0a4018ba4e90ffce Mon Sep 17 00:00:00 2001
From: Gabo <gabriel.piles.glez@gmail.com>
Date: Thu, 28 Nov 2024 18:56:03 +0100
Subject: [PATCH] Add llama benchmarks

---
 README.md                                           |   4 +++-
 .../test-00000-of-00001.parquet                     | Bin
 src/benchmark_models.py                             |   5 +++--
 3 files changed, 6 insertions(+), 3 deletions(-)
 rename test-00000-of-00001.parquet => en-fr/test-00000-of-00001.parquet (100%)

diff --git a/README.md b/README.md
index dc2d146..dfa2eef 100644
--- a/README.md
+++ b/README.md
@@ -82,12 +82,14 @@ Performance 100 samples
 | Model        | Prompt   | Arabic-English | English-Spanish | English-French | English-Russian |
 |--------------|----------|----------------|-----------------|----------------|-----------------|
 | DeepL        |          | 33.11          | -               | 36.05          | 24.64           |
+| llama3.1-70B | Prompt 3 | -              | -               | 32.65          | -               |
 | aya-35b      | Prompt 2 | 30.75          | -               | 31.48          | 20.06           |
 | glm4:9b      | Prompt 2 | 19.62          | -               | 30.21          | 16.12           |
 | glm-BF16-64  | Prompt 2 | 18.75          | -               | 28.84          | 17.20           |
 | glm-BF16-128 | Prompt 2 | 20.05          | -               | 30.09          | 17.82           |
 | llama3.1-8B  | Prompt 2 | 10.52          | 25.37           | 27.53          | 14.04           |
-| llama3.2-3B  | Prompt 3 | -              | -               | 15.88          | -               |
+| llama3.1-8B  | Prompt 3 | -              | -               | 26.57          | -               |
+| llama3.2-3B  | Prompt 3 | -              | -               | 19.70          | -               |
 
 
 
diff --git a/test-00000-of-00001.parquet b/en-fr/test-00000-of-00001.parquet
similarity index 100%
rename from test-00000-of-00001.parquet
rename to en-fr/test-00000-of-00001.parquet
diff --git a/src/benchmark_models.py b/src/benchmark_models.py
index 1be3ebf..b55ff24 100644
--- a/src/benchmark_models.py
+++ b/src/benchmark_models.py
@@ -10,6 +10,7 @@
 from tqdm import tqdm
 from huggingface_hub import hf_hub_download
 
+from configuration import ROOT_PATH
 from data_model.TranslationTask import TranslationTask
 from fast_bleu import BLEU
 from translate import get_content
@@ -95,7 +96,7 @@ def get_performance(samples: list[tuple[str, str]], path: Path):
         predictions += json.loads(Path(join(path, file)).read_text())
     average_performance = 0
     for i, (text_from, text_to) in tqdm(enumerate(samples)):
-        prediction = predictions[i]
+        prediction = predictions[i].replace("```", "")
         average_performance += get_bleu_score(text_to, prediction)
 
     print(f"Average performance: {100 * average_performance / len(samples)}")
@@ -129,5 +130,5 @@ def get_characters_to_translate():
 
     # benchmark("aya:35b", "ar-en", 100)
     # benchmark("glm4:9b", "en-fr", 100)
-    benchmark("llama3.1", "en-fr")
+    benchmark("llama3.1:70b", "en-fr", 100)
     print("time", round(time() - start, 2), "s")