From 230f7f7ced61668de022a295ccaf5598e6bd1b29 Mon Sep 17 00:00:00 2001
From: Adam Bouhenguel
Date: Mon, 4 Dec 2023 20:13:59 -0500
Subject: [PATCH] llamafile 0.2.1, model images use server

---
 Dockerfile         | 30 +++++++++++++++---------------
 README.md          | 30 +++++++++++++++++-------------
 docker-compose.yml | 40 +++++++++++++++++++++++++++++++---------
 3 files changed, 63 insertions(+), 37 deletions(-)

diff --git a/Dockerfile b/Dockerfile
index 8b35ed6..3705a44 100644
--- a/Dockerfile
+++ b/Dockerfile
@@ -5,17 +5,17 @@ FROM busybox as busybox
 # define a starting point "scratch" image that can run APEs
 FROM scratch as cosmos-scratch
 COPY --from=busybox /bin/uname /usr/bin/
-ADD --chmod=0755 --checksum=sha256:349f3f511c4eb70c4db52e2fb99a41d9b208c83c3ec682c057ebaf1fe5f9857b https://cosmo.zip/pub/cosmos/bin/assimilate-x86_64.elf /usr/bin/
-ADD --chmod=0755 --checksum=sha256:7b6f27e3997be53afc70717e0d7dea35eea799987224380bccc176b494996d0f https://cosmo.zip/pub/cosmos/bin/dash /bin/sh
+ADD --chmod=0755 --checksum=sha256:349f3f511c4eb70c4db52e2fb99a41d9b208c83c3ec682c057ebaf1fe5f9857b https://cosmo.zip/pub/cosmos/v/3.1.3/bin/assimilate-x86_64.elf /usr/bin/
+ADD --chmod=0755 --checksum=sha256:7b6f27e3997be53afc70717e0d7dea35eea799987224380bccc176b494996d0f https://cosmo.zip/pub/cosmos/v/3.1.3/bin/dash /bin/sh
 RUN ["/usr/bin/assimilate-x86_64.elf", "-c", "/bin/sh"]
-ADD --checksum=sha256:abf3b1bb7182935bf48d98dc143c51ee563d29a1fd2c3930ff5a8d8c8d823817 --chmod=0755 https://justine.lol/ape.elf /usr/bin/ape
+ADD --checksum=sha256:abf3b1bb7182935bf48d98dc143c51ee563d29a1fd2c3930ff5a8d8c8d823817 --chmod=0755 https://cosmo.zip/pub/cosmos/v/3.1.3/bin/ape-x86_64.elf /usr/bin/ape
 ENV PATH=/bin:/usr/bin
 
 # download and unpack all the cosmos binaries
 FROM cosmos-scratch as unpack-cosmos
-ADD --chmod=0755 --checksum=sha256:48e33306662ff052b21bb84e4b03779d94127727758cfc43d1551ea05d44ee3d https://cosmo.zip/pub/cosmos/bin/unzip /usr/bin/
+ADD --chmod=0755 --checksum=sha256:d9bf928f1aa32e3588087337cb04568b3284fc678b079612e7b74f72f01c0913 https://cosmo.zip/pub/cosmos/v/3.1.3/bin/unzip /usr/bin/
 RUN ["/usr/bin/assimilate-x86_64.elf", "-c", "/usr/bin/unzip"]
-ADD --checksum=sha256:241dc90f3e92b22c9e08cfb5f6df2e920da258e3c461d9677f267ab7a6dff2fd https://cosmo.zip/pub/cosmos/zip/cosmos.zip /dl/
+ADD --checksum=sha256:e280987f99f8c9802fa5564adbdc08ec7a8f8738c16a75cf74fb820cc1c14981 https://cosmo.zip/pub/cosmos/zip/cosmos-3.1.3.zip /dl/cosmos.zip
 
 # list of binaries that must be assimilated and manifest for /bin as described in https://justine.lol/cosmos.txt (as of 2023-11-29)
 WORKDIR /opt/cosmos
@@ -108,14 +108,17 @@ ENTRYPOINT ["/bin/sh", "-c", "exec \"$@\"", "sh", "/usr/bin/llamafile"]
 
 FROM cosmos-scratch as llamafile-gguf
 LABEL org.opencontainers.image.source https://github.com/ajbouh/cosmos
-ADD --checksum=sha256:dc538ce8721bb84ad3a9f683757ce7a227e61bf2c6e092c4014838fe198c41cc --chmod=0755 https://github.com/Mozilla-Ocho/llamafile/releases/download/0.1/llamafile-main-0.1 /usr/bin/llamafile-main
+ADD --checksum=sha256:c7151d48677e352e492731bd999d9d74c792fa1440715a858dbf3b92ee274abe --chmod=0755 https://github.com/Mozilla-Ocho/llamafile/releases/download/0.2.1/llamafile-main-0.2.1 /usr/bin/llamafile-main
+ADD --checksum=sha256:2b3c692e50d903cbf6ac3d8908f8394101b5be5f8a4573b472975fa8c9f09e68 --chmod=0755 https://github.com/Mozilla-Ocho/llamafile/releases/download/0.2.1/llamafile-server-0.2.1 /usr/bin/llamafile-server
 ARG GGUF_URL
 ARG GGUF_CHECKSUM
 ADD --checksum=${GGUF_CHECKSUM} --chmod=0755 ${GGUF_URL} /model.gguf
-ENTRYPOINT ["/bin/sh", "-c", "exec \"$@\"", "sh", "/usr/bin/llamafile-main", "-m", "/model.gguf"] +EXPOSE 8080 +ENTRYPOINT ["/bin/sh", "-c", "exec \"$@\"", "sh", "/usr/bin/llamafile-server", "-m", "/model.gguf", "--port", "8080", "--host", "0.0.0.0", "--nobrowser"] FROM nvidia/cuda:12.1.1-devel-ubuntu22.04 as devel-llamafile -ADD --checksum=sha256:dc538ce8721bb84ad3a9f683757ce7a227e61bf2c6e092c4014838fe198c41cc --chmod=0755 https://github.com/Mozilla-Ocho/llamafile/releases/download/0.1/llamafile-main-0.1 /usr/bin/llamafile-main +ADD --checksum=sha256:c7151d48677e352e492731bd999d9d74c792fa1440715a858dbf3b92ee274abe --chmod=0755 https://github.com/Mozilla-Ocho/llamafile/releases/download/0.2.1/llamafile-main-0.2.1 /usr/bin/llamafile-main +ADD --checksum=sha256:2b3c692e50d903cbf6ac3d8908f8394101b5be5f8a4573b472975fa8c9f09e68 --chmod=0755 https://github.com/Mozilla-Ocho/llamafile/releases/download/0.2.1/llamafile-server-0.2.1 /usr/bin/llamafile-server # HACK we need to assimilate so this can run on github actions... COPY --from=unpack-cosmos /usr/bin/assimilate /usr/bin/ RUN /usr/bin/assimilate -c /usr/bin/llamafile-main @@ -134,11 +137,6 @@ COPY --from=devel-llamafile /root/.llamafile /root/.llamafile ENV PATH=/bin:/usr/bin ENV HOME=/root ENV LD_LIBRARY_PATH=/usr/local/cuda/targets/x86_64-linux/lib:/lib:/lib64 -# HACK forge an executable nvcc, because llamafile needs to find nvcc before looking for cached .cosmo and .llamafile files -COPY --from=unpack-cosmos /bin/chmod /bin/ -WORKDIR /usr/local/cuda/bin/ -RUN printf "" >nvcc -RUN chmod 0755 nvcc # HACK things seem to fail if we have multiple CUDA devices. limit ourselves to one device for now to avoid errors like: # > CUDA error 2 at /root/.llamafile/ggml-cuda.cu:7864: out of memory # > current device: 4 @@ -155,10 +153,12 @@ ENTRYPOINT ["/bin/sh", "-c", "exec \"$@\" --n-gpu-layers $LLAMAFILE_N_GPU_LAYERS FROM llamafile-cuda-scratch as llamafile-gguf-cuda LABEL org.opencontainers.image.source https://github.com/ajbouh/cosmos -ADD --checksum=sha256:dc538ce8721bb84ad3a9f683757ce7a227e61bf2c6e092c4014838fe198c41cc --chmod=0755 https://github.com/Mozilla-Ocho/llamafile/releases/download/0.1/llamafile-main-0.1 /usr/bin/llamafile-main +ADD --checksum=sha256:c7151d48677e352e492731bd999d9d74c792fa1440715a858dbf3b92ee274abe --chmod=0755 https://github.com/Mozilla-Ocho/llamafile/releases/download/0.2.1/llamafile-main-0.2.1 /usr/bin/llamafile-main +ADD --checksum=sha256:2b3c692e50d903cbf6ac3d8908f8394101b5be5f8a4573b472975fa8c9f09e68 --chmod=0755 https://github.com/Mozilla-Ocho/llamafile/releases/download/0.2.1/llamafile-server-0.2.1 /usr/bin/llamafile-server ARG GGUF_URL ARG GGUF_CHECKSUM ADD --checksum=${GGUF_CHECKSUM} --chmod=0755 ${GGUF_URL} /model.gguf ARG LLAMAFILE_N_GPU_LAYERS=35 ENV LLAMAFILE_N_GPU_LAYERS=${LLAMAFILE_N_GPU_LAYERS} -ENTRYPOINT ["/bin/sh", "-c", "exec \"$@\" --n-gpu-layers $LLAMAFILE_N_GPU_LAYERS", "sh", "/usr/bin/llamafile-main", "-m", "/model.gguf"] +EXPOSE 8080 +ENTRYPOINT ["/bin/sh", "-c", "exec \"$@\" --n-gpu-layers $LLAMAFILE_N_GPU_LAYERS", "sh", "/usr/bin/llamafile-server", "-m", "/model.gguf", "--port", "8080", "--host", "0.0.0.0", "--nobrowser"] diff --git a/README.md b/README.md index 6eda8b6..99b2534 100644 --- a/README.md +++ b/README.md @@ -2,19 +2,21 @@ To try out a container built by this repo (without needing to do a git clone), you can: ``` -docker run --rm -it ghcr.io/ajbouh/cosmos:3.1.1 -docker run --rm -it ghcr.io/ajbouh/cosmos:python-cosmo-3.1.1 -docker run --rm -it ghcr.io/ajbouh/cosmos:lua-cosmo-3.1.1 
-docker run --rm -it ghcr.io/ajbouh/cosmos:sqlite3-cosmo-3.1.1
-docker run --rm -it ghcr.io/ajbouh/cosmos:qjs-cosmo-3.1.1
-docker run --rm -it ghcr.io/ajbouh/cosmos:mistral-7b-instruct-v0.1-q4_k_m-cosmo-3.1.1
-docker run --rm -it ghcr.io/ajbouh/cosmos:llava-v1.5-7b-q4_k-cosmo-3.1.1
+docker run --rm -it ghcr.io/ajbouh/cosmos:3.1.3
+docker run --rm -it ghcr.io/ajbouh/cosmos:python-cosmo-3.1.3
+docker run --rm -it ghcr.io/ajbouh/cosmos:lua-cosmo-3.1.3
+docker run --rm -it ghcr.io/ajbouh/cosmos:sqlite3-cosmo-3.1.3
+docker run --rm -it ghcr.io/ajbouh/cosmos:qjs-cosmo-3.1.3
+docker run --rm -it ghcr.io/ajbouh/cosmos:mistral-7b-instruct-v0.1-q4_k_m-cosmo-3.1.3
+docker run --rm -it ghcr.io/ajbouh/cosmos:llava-v1.5-7b-q4_k-cosmo-3.1.3
+docker run --rm -it ghcr.io/ajbouh/cosmos:airoboros-m-7b-3.1.2-dare-0.85.q4_k_m-cosmo-3.1.3
 ```
 
 If you have a GPU and your docker daemon has been configured to make use of it, try one of these commands:
 
 ```
-docker run --rm -it --gpus all ghcr.io/ajbouh/cosmos:mistral-7b-instruct-v0.1-q4_k_m-cuda-12.1.1-cosmo-3.1.1
-docker run --rm -it --gpus all ghcr.io/ajbouh/cosmos:llava-v1.5-7b-q4_k-cuda-12.1.1-cosmo-3.1.1
+docker run --rm -it --gpus all ghcr.io/ajbouh/cosmos:mistral-7b-instruct-v0.1-q4_k_m-cuda-12.1.1-cosmo-3.1.3
+docker run --rm -it --gpus all ghcr.io/ajbouh/cosmos:llava-v1.5-7b-q4_k-cuda-12.1.1-cosmo-3.1.3
+docker run --rm -it --gpus all ghcr.io/ajbouh/cosmos:airoboros-m-7b-3.1.2-dare-0.85.q4_k_m-cuda-12.1.1-cosmo-3.1.3
 ```
 To build and run one of these, first clone the project and then try one of these docker commands:
@@ -24,9 +26,11 @@ docker compose run --build --rm -it python
 docker compose run --build --rm -it lua
 docker compose run --build --rm -it sqlite3
 docker compose run --build --rm -it qjs
-docker compose run --build --rm -it mistral-7b-instruct-v0.1-q4_k_m-cuda
-docker compose run --build --rm -it mistral-7b-instruct-v0.1-q4_k_m
-docker compose run --build --rm -it llava-v1.5-7b-q4_k-cuda
-docker compose run --build --rm -it llava-v1.5-7b-q4_k
+docker compose run --build --service-ports --rm -it mistral-7b-instruct-v0.1-q4_k_m-cuda
+docker compose run --build --service-ports --rm -it mistral-7b-instruct-v0.1-q4_k_m
+docker compose run --build --service-ports --rm -it llava-v1.5-7b-q4_k-cuda
+docker compose run --build --service-ports --rm -it llava-v1.5-7b-q4_k
+docker compose run --build --service-ports --rm -it airoboros-m-7b-3.1.2-dare-0.85.q4_k_m-cuda
+docker compose run --build --service-ports --rm -it airoboros-m-7b-3.1.2-dare-0.85.q4_k_m
 ```
 
diff --git a/docker-compose.yml b/docker-compose.yml
index 9164113..430bca6 100644
--- a/docker-compose.yml
+++ b/docker-compose.yml
@@ -1,40 +1,40 @@
 version: "3.8"
 services:
   cosmos:
-    image: ghcr.io/ajbouh/cosmos:3.1.1
+    image: ghcr.io/ajbouh/cosmos:3.1.3
     build:
       dockerfile: Dockerfile
       target: cosmos
   python:
-    image: ghcr.io/ajbouh/cosmos:python-cosmo-3.1.1
+    image: ghcr.io/ajbouh/cosmos:python-cosmo-3.1.3
     build:
       dockerfile: Dockerfile
       target: ape
       args:
         COSMOS_EXE: /usr/bin/python
   lua:
-    image: ghcr.io/ajbouh/cosmos:lua-cosmo-3.1.1
+    image: ghcr.io/ajbouh/cosmos:lua-cosmo-3.1.3
     build:
       dockerfile: Dockerfile
       target: ape
       args:
         COSMOS_EXE: /usr/bin/lua
   sqlite3:
-    image: ghcr.io/ajbouh/cosmos:sqlite3-cosmo-3.1.1
+    image: ghcr.io/ajbouh/cosmos:sqlite3-cosmo-3.1.3
     build:
       dockerfile: Dockerfile
      target: ape
       args:
         COSMOS_EXE: /usr/bin/sqlite3
   qjs:
-    image: ghcr.io/ajbouh/cosmos:qjs-cosmo-3.1.1
+    image: ghcr.io/ajbouh/cosmos:qjs-cosmo-3.1.3
     build:
       dockerfile: Dockerfile
       target: ape
       args:
         COSMOS_EXE: /usr/bin/qjs
   mistral-7b-instruct-v0.1-q4_k_m-cuda:
-    image: ghcr.io/ajbouh/cosmos:mistral-7b-instruct-v0.1-q4_k_m-cuda-12.1.1-cosmo-3.1.1
+    image: ghcr.io/ajbouh/cosmos:mistral-7b-instruct-v0.1-q4_k_m-cuda-12.1.1-cosmo-3.1.3
     deploy: {resources: {reservations: {devices: [{driver: nvidia, count: all, capabilities: ["gpu"]}]}}}
     build:
       dockerfile: Dockerfile
@@ -44,7 +44,7 @@ services:
       LLAMAFILE_CHECKSUM: sha256:c8d34c244e01a91df1e8b22196dfddb9662f6b08fbcd4a23609d7b736b56f4ae
       LLAMAFILE_N_GPU_LAYERS: 35
   mistral-7b-instruct-v0.1-q4_k_m:
-    image: ghcr.io/ajbouh/cosmos:mistral-7b-instruct-v0.1-q4_k_m-cosmo-3.1.1
+    image: ghcr.io/ajbouh/cosmos:mistral-7b-instruct-v0.1-q4_k_m-cosmo-3.1.3
     build:
       dockerfile: Dockerfile
       target: llamafile
@@ -52,8 +52,9 @@ services:
       LLAMAFILE_URL: https://huggingface.co/jartine/mistral-7b.llamafile/resolve/main/mistral-7b-instruct-v0.1-Q4_K_M-main.llamafile?download=true
       LLAMAFILE_CHECKSUM: sha256:c8d34c244e01a91df1e8b22196dfddb9662f6b08fbcd4a23609d7b736b56f4ae
   llava-v1.5-7b-q4_k-cuda:
-    image: ghcr.io/ajbouh/cosmos:llava-v1.5-7b-q4_k-cuda-12.1.1-cosmo-3.1.1
+    image: ghcr.io/ajbouh/cosmos:llava-v1.5-7b-q4_k-cuda-12.1.1-cosmo-3.1.3
     deploy: {resources: {reservations: {devices: [{driver: nvidia, count: all, capabilities: ["gpu"]}]}}}
+    ports: ["8080:8080"]
     build:
       dockerfile: Dockerfile
       target: llamafile-gguf-cuda
@@ -62,10 +63,31 @@ services:
       GGUF_CHECKSUM: sha256:c91ebf0a628ceb25e374df23ad966cc1bf1514b33fecf4f0073f9619dec5b3f9
       LLAMAFILE_N_GPU_LAYERS: 35
   llava-v1.5-7b-q4_k:
-    image: ghcr.io/ajbouh/cosmos:llava-v1.5-7b-q4_k-cosmo-3.1.1
+    image: ghcr.io/ajbouh/cosmos:llava-v1.5-7b-q4_k-cosmo-3.1.3
+    ports: ["8080:8080"]
     build:
       dockerfile: Dockerfile
       target: llamafile-gguf
       args:
         GGUF_URL: https://huggingface.co/jartine/llava-v1.5-7B-GGUF/resolve/main/llava-v1.5-7b-Q4_K.gguf?download=true
         GGUF_CHECKSUM: sha256:c91ebf0a628ceb25e374df23ad966cc1bf1514b33fecf4f0073f9619dec5b3f9
+  airoboros-m-7b-3.1.2-dare-0.85.q4_k_m-cuda:
+    image: ghcr.io/ajbouh/cosmos:airoboros-m-7b-3.1.2-dare-0.85.q4_k_m-cuda-12.1.1-cosmo-3.1.3
+    deploy: {resources: {reservations: {devices: [{driver: nvidia, count: all, capabilities: ["gpu"]}]}}}
+    ports: ["8080:8080"]
+    build:
+      dockerfile: Dockerfile
+      target: llamafile-gguf-cuda
+      args:
+        GGUF_URL: https://huggingface.co/TheBloke/airoboros-m-7B-3.1.2-dare-0.85-GGUF/resolve/main/airoboros-m-7b-3.1.2-dare-0.85.Q4_K_M.gguf?download=true
+        GGUF_CHECKSUM: sha256:5d6bc74b99aa89d3c35c90c74d6844e1e45bd810dd08f9f55252f74ed87b0663
+        LLAMAFILE_N_GPU_LAYERS: 35
+  airoboros-m-7b-3.1.2-dare-0.85.q4_k_m:
+    image: ghcr.io/ajbouh/cosmos:airoboros-m-7b-3.1.2-dare-0.85.q4_k_m-cosmo-3.1.3
+    ports: ["8080:8080"]
+    build:
+      dockerfile: Dockerfile
+      target: llamafile-gguf
+      args:
+        GGUF_URL: https://huggingface.co/TheBloke/airoboros-m-7B-3.1.2-dare-0.85-GGUF/resolve/main/airoboros-m-7b-3.1.2-dare-0.85.Q4_K_M.gguf?download=true
+        GGUF_CHECKSUM: sha256:5d6bc74b99aa89d3c35c90c74d6844e1e45bd810dd08f9f55252f74ed87b0663
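
With this change the model images no longer drop into an interactive llamafile-main prompt: they run llamafile-server and listen on port 8080. A minimal smoke test, assuming llamafile-server 0.2.1 exposes the llama.cpp-style `/completion` JSON endpoint on the port published above (the tag below is one of those listed in the README):

```
# start one of the model images, publishing the llamafile-server port
docker run --rm -p 8080:8080 ghcr.io/ajbouh/cosmos:mistral-7b-instruct-v0.1-q4_k_m-cosmo-3.1.3

# from another shell, request a completion over HTTP
curl http://localhost:8080/completion \
  -H 'Content-Type: application/json' \
  -d '{"prompt": "Building a website can be done in 10 simple steps:", "n_predict": 64}'
```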