-
Notifications
You must be signed in to change notification settings - Fork 523
/
Copy pathbenchmark.py
70 lines (56 loc) · 2.27 KB
/
benchmark.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
#!/usr/bin/env python3
# benchmark a quantized GGML model with ollama API
import time
import argparse
import json
import requests
from pprint import pp
# Small LLM: tinyllama
# Baseline request payload; callers copy it and override individual fields.
# A fixed seed and a temperature of 0 are set in "options".
DEFAULT_PROMPT = dict(
    model="tinyllama",
    prompt="Why is the sky blue?",
    options=dict(seed=123, temperature=0),
    format="json",
    stream=False,
)
# parse command-line arguments
parser = argparse.ArgumentParser()
# --model is optional so that its default can actually take effect; combining
# required=True with a default made the default unreachable.
parser.add_argument('-m', '--model', type=str, default=DEFAULT_PROMPT.get("model"), help="name of model to run")
parser.add_argument('-p', '--prompt', type=str, default=DEFAULT_PROMPT.get("prompt"))
parser.add_argument('--runs', type=int, default=2, help='the number of benchmark timing iterations')
# The PID stays mandatory: it is used to read /proc/<pid>/status, and there is
# no meaningful default for it.
parser.add_argument('--OLLAMA_PID', type=str, required=True, help='the pid of the ollama process')
args = parser.parse_args()
print(args)

# Build the request payload from the defaults plus the CLI overrides.
# dict.copy() is shallow: the nested "options" dict is shared with
# DEFAULT_PROMPT, which is fine as long as it is not mutated.
data = DEFAULT_PROMPT.copy()
data['model'] = args.model  # previously the --model argument was parsed but never used
data['prompt'] = args.prompt
def get_max_memory_usage(PID: str = "self") -> int:
    ''' Maximum memory usage in bytes

    Reads the ``VmPeak`` field from ``/proc/<PID>/status``. The file reports
    the value with a "kB" suffix, which is converted to bytes here.

    PID: process id as a string, or "self" for the current process.
    Returns the VmPeak value in bytes.
    Raises OSError if the status file cannot be opened, and IndexError if
    the file has no VmPeak line.
    '''
    with open(f'/proc/{PID}/status', encoding='utf-8') as f:
        for line in f:
            if line.startswith('VmPeak:'):
                # Line looks like "VmPeak:    123456 kB"; field 1 is the number.
                return int(line.split()[1]) * 1024
    # Same exception type the original split()-based lookup produced.
    raise IndexError(f'VmPeak not found in /proc/{PID}/status')
def send_test_prompt(json_data: dict, url: str = "") -> requests.Response:
    ''' send a test prompt to local ollama container

    json_data: request payload; it is serialized to a JSON body by requests.
    url: full endpoint URL. When empty, the local generate endpoint
         http://127.0.0.1:11434/api/generate is used.
    Returns the requests.Response of the POST call.
    '''
    if not url:
        # requests needs an explicit scheme, and the payload (with a "prompt"
        # field) targets the generate endpoint.
        url = "http://127.0.0.1:11434/api/generate"
    # Pass the dict itself: the json= parameter already serializes it, so a
    # json.dumps() here would send a JSON *string* instead of a JSON object.
    return requests.post(url, json=json_data)
def run_benchmark(runs: int, json_data: dict, test_url: str = "") -> None:
    ''' run the benchmark

    Sends the same prompt `runs` times, prints each successful response with
    its elapsed wall-clock time, then prints the peak memory of the ollama
    process and the average time per successful run.

    runs: number of requests to send.
    json_data: request payload; must contain "model" and "prompt".
    test_url: endpoint URL forwarded to send_test_prompt ("" = its default).

    Reads the module-level `args.OLLAMA_PID` for the memory report.
    '''
    total_time = 0.0
    completed = 0  # only successful runs count towards the average

    for run in range(runs):
        time_begin = time.perf_counter()
        response = send_test_prompt(json_data, test_url)
        time_elapsed = time.perf_counter() - time_begin

        if not response.ok:
            pp(f'received error code from api service: {response.status_code}')
            continue

        pp(f'[+] run #{run}')
        pp(f'[-] model: {json_data["model"]}, prompt: {json_data["prompt"]}')
        pp(f'[-] response: {json.loads(response.text)["response"].strip()}, elapsed time: {time_elapsed}')
        total_time += time_elapsed
        completed += 1

    pp(f'[+] peak ram used: {get_max_memory_usage(args.OLLAMA_PID) / 1024 / 1024} MB')
    # Dividing by `runs` would understate the average whenever a request
    # failed and was skipped above.
    if completed > 0:
        pp(f'[+] average time: {total_time / completed:.2f}')
# Script entry: benchmark with the CLI-selected run count and payload, using
# the default endpoint (empty URL).
run_benchmark(runs=args.runs, json_data=data, test_url="")