Add llama benchmarks

huridocs · Nov 28, 2024 · 5daf071
1 parent 3102084
commit 5daf071
Show file tree

Hide file tree

Showing 3 changed files with 6 additions and 3 deletions.
diff --git a/README.md b/README.md
@@ -82,12 +82,14 @@ Performance 100 samples
 | Model        | Prompt   | Arabic-English | English-Spanish | English-French | English-Russian |
 |--------------|----------|----------------|-----------------|----------------|-----------------|
 | DeepL        |          | 33.11          | -               | 36.05          | 24.64           |
+| llama3.1:70B | Prompt 3 | -              | -               | 32.65          | -               |
 | aya-35b      | Prompt 2 | 30.75          | -               | 31.48          | 20.06           |
 | glm4:9b      | Prompt 2 | 19.62          | -               | 30.21          | 16.12           |
 | glm-BF16-64  | Prompt 2 | 18.75          | -               | 28.84          | 17.20           |
 | glm-BF16-128 | Prompt 2 | 20.05          | -               | 30.09          | 17.82           |
 | llama3.1-8B  | Prompt 2 | 10.52          | 25.37           | 27.53          | 14.04           |
-| llama3.2-3B  | Prompt 3 | -              | -               | 15.88          | -               |
+| llama3.1-8B  | Prompt 3 | -              | -               | 26.57          | -               |
+| llama3.2-3B  | Prompt 3 | -              | -               | 19.70          | -               |
 
 
 

diff --git a/test-00000-of-00001.parquet b/en-fr/test-00000-of-00001.parquet
rename from test-00000-of-00001.parquet
rename to en-fr/test-00000-of-00001.parquet
diff --git a/src/benchmark_models.py b/src/benchmark_models.py
@@ -10,6 +10,7 @@
 from tqdm import tqdm
 from huggingface_hub import hf_hub_download
 
+from configuration import ROOT_PATH
 from data_model.TranslationTask import TranslationTask
 from fast_bleu import BLEU
 from translate import get_content
@@ -95,7 +96,7 @@ def get_performance(samples: list[tuple[str, str]], path: Path):
         predictions += json.loads(Path(join(path, file)).read_text())
     average_performance = 0
     for i, (text_from, text_to) in tqdm(enumerate(samples)):
-        prediction = predictions[i]
+        prediction = predictions[i].replace("```", "")
         average_performance += get_bleu_score(text_to, prediction)
 
     print(f"Average performance: {100 * average_performance / len(samples)}")
@@ -129,5 +130,5 @@ def get_characters_to_translate():
 
     # benchmark("aya:35b", "ar-en", 100)
     # benchmark("glm4:9b", "en-fr", 100)
-    benchmark("llama3.1", "en-fr")
+    benchmark("llama3.1:70b", "en-fr", 100)
     print("time", round(time() - start, 2), "s")