From 5daf071feb66134379c5757e0a4018ba4e90ffce Mon Sep 17 00:00:00 2001 From: Gabo Date: Thu, 28 Nov 2024 18:56:03 +0100 Subject: [PATCH] Add llama benchmarks --- README.md | 4 +++- .../test-00000-of-00001.parquet | Bin src/benchmark_models.py | 5 +++-- 3 files changed, 6 insertions(+), 3 deletions(-) rename test-00000-of-00001.parquet => en-fr/test-00000-of-00001.parquet (100%) diff --git a/README.md b/README.md index dc2d146..dfa2eef 100644 --- a/README.md +++ b/README.md @@ -82,12 +82,14 @@ Performance 100 samples | Model | Prompt | Arabic-English | English-Spanish | English-French | English-Russian | |--------------|----------|----------------|-----------------|----------------|-----------------| | DeepL | | 33.11 | - | 36.05 | 24.64 | +| llama3.1:70B | Prompt 3 | - | - | 32.65 | - | | aya-35b | Prompt 2 | 30.75 | - | 31.48 | 20.06 | | glm4:9b | Prompt 2 | 19.62 | - | 30.21 | 16.12 | | glm-BF16-64 | Prompt 2 | 18.75 | - | 28.84 | 17.20 | | glm-BF16-128 | Prompt 2 | 20.05 | - | 30.09 | 17.82 | | llama3.1-8B | Prompt 2 | 10.52 | 25.37 | 27.53 | 14.04 | -| llama3.2-3B | Prompt 3 | - | - | 15.88 | - | +| llama3.1-8B | Prompt 3 | - | - | 26.57 | - | +| llama3.2-3B | Prompt 3 | - | - | 19.70 | - | diff --git a/test-00000-of-00001.parquet b/en-fr/test-00000-of-00001.parquet similarity index 100% rename from test-00000-of-00001.parquet rename to en-fr/test-00000-of-00001.parquet diff --git a/src/benchmark_models.py b/src/benchmark_models.py index 1be3ebf..b55ff24 100644 --- a/src/benchmark_models.py +++ b/src/benchmark_models.py @@ -10,6 +10,7 @@ from tqdm import tqdm from huggingface_hub import hf_hub_download +from configuration import ROOT_PATH from data_model.TranslationTask import TranslationTask from fast_bleu import BLEU from translate import get_content @@ -95,7 +96,7 @@ def get_performance(samples: list[tuple[str, str]], path: Path): predictions += json.loads(Path(join(path, file)).read_text()) average_performance = 0 for i, (text_from, text_to) in tqdm(enumerate(samples)): - prediction = predictions[i] + prediction = predictions[i].replace("```", "") average_performance += get_bleu_score(text_to, prediction) print(f"Average performance: {100 * average_performance / len(samples)}") @@ -129,5 +130,5 @@ def get_characters_to_translate(): # benchmark("aya:35b", "ar-en", 100) # benchmark("glm4:9b", "en-fr", 100) - benchmark("llama3.1", "en-fr") + benchmark("llama3.1:70b", "en-fr", 100) print("time", round(time() - start, 2), "s")