-
Notifications
You must be signed in to change notification settings - Fork 0
/
cpu_only.py
85 lines (75 loc) · 3.42 KB
/
cpu_only.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
# Required libraries:
# llama-cpp-python
# huggingface_hub
# Easy installation command (make sure you're in your virtual environment first)
# pip install --upgrade --upgrade-strategy eager --force-reinstall --no-cache-dir llama-cpp-python huggingface_hub
# You'll need to create a HuggingFace account and create a token under your account Settings
# You'll also need Git LFS and some sort of C compiler: Visual Studio for Windows, or gcc or clang for Linux
# If you're unable to get a C compiler up and running, there's a pre-built wheel version of llama-cpp-python:
# pip install llama-cpp-python --extra-index-url https://abetlen.github.io/llama-cpp-python/whl/cpu
# Once you've installed all the packages successfully, run the commands:
# git lfs install
# huggingface-cli login
#
# The huggingface-cli login command will require you to paste your HuggingFace token in, this allows you to pull models
# from HuggingFace's model database for use in Python code
# Import llama-cpp-python and the built-in timeit module
from llama_cpp import Llama
from timeit import default_timer as timer
def format_duration(elapsed_seconds):
    """Render a duration as ``"H hours, M minutes, and S seconds"``.

    The value is rounded to the nearest whole second *before* it is split
    into hours, minutes, and seconds. Rounding each component on its own
    (as ``f"{secs:.0f}"`` does) can print an out-of-range value such as
    "0 minutes, and 60 seconds" for a 59.7 second duration.

    Args:
        elapsed_seconds: Duration in seconds (int or float), e.g. the
            difference of two ``timer()`` readings.

    Returns:
        The formatted duration string.
    """
    total_seconds = int(round(elapsed_seconds))
    minutes, seconds = divmod(total_seconds, 60)
    hours, minutes = divmod(minutes, 60)
    return f"{hours} hours, {minutes} minutes, and {seconds} seconds"


def main():
    """Acquire the model, run one chat completion, and report both timings."""
    print('Starting to get the LLM...')
    # Get start time for getting the model
    llm_start = timer()
    # Pull down the model from HuggingFace.
    # The first time this runs, you'll need an internet connection to actually
    # download the model itself. After the first time, the model is stored
    # locally in the HuggingFace cache directory, so later runs are
    # substantially faster.
    llm = Llama.from_pretrained(
        # Specify which HuggingFace repository the model is in
        repo_id='QuantFactory/Meta-Llama-3-8B-Instruct-GGUF-v2',
        # Specify the name of the model file to download
        filename='Meta-Llama-3-8B-Instruct-v2.Q6_K.gguf',
        verbose=False,
    )
    # You can also download the model in advance and tell llama-cpp-python to
    # just pull it from a local file:
    #     llm = Llama(model_path="relative/file/path/to/model")
    # Get end time for acquiring the model
    llm_end = timer()
    print('LLM acquired!')
    llm_elapsed_time = llm_end - llm_start

    print('Starting to perform inference...')
    # Get start time for inference
    inference_start = timer()
    # Start the inference using the high-level API provided by llama-cpp-python
    output = llm.create_chat_completion(
        # Define the message template for the conversation
        messages=[
            # Define how the system (the LLM) is to act
            {
                "role": "system",
                "content": "You are an assistant who perfectly describes large language models imitating the speech style of pirates."
            },
            # Define what the user's prompt is for the LLM.
            {
                "role": "user",
                "content": "Tell me what a LLM is."
            },
        ]
    )
    # Get end time for inference
    inference_end = timer()
    print('Inference completed!')
    inference_elapsed_time = inference_end - inference_start

    # Printing results
    print(f"Acquiring the model took: {format_duration(llm_elapsed_time)}")
    print(f"Performing inference took: {format_duration(inference_elapsed_time)}")
    print()
    print(output["choices"][0]["message"]["content"])


# Only download the model and run inference when executed as a script, so
# importing this module (e.g. to reuse format_duration) has no side effects.
if __name__ == "__main__":
    main()