Update app.py
moved modelfile to @cache, add flash attention and number of threads=2
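The "@cache" in the message most plausibly refers to hf_hub_download's on-disk cache (under ~/.cache/huggingface by default): once the GGUF has been fetched, later calls simply resolve the local path, so moving the download out of st.session_state and into create_chat() costs nothing on reruns. A minimal sketch of that behavior, under that assumption:

from huggingface_hub import hf_hub_download

# First call downloads the file; later calls hit the on-disk cache and
# return the same local path without re-transferring the weights.
path_1 = hf_hub_download(
    repo_id="bartowski/gemma-2-2b-it-GGUF",
    filename="gemma-2-2b-it-Q5_K_M.gguf",
)
path_2 = hf_hub_download(  # cache hit: resolves locally
    repo_id="bartowski/gemma-2-2b-it-GGUF",
    filename="gemma-2-2b-it-Q5_K_M.gguf",
)
assert path_1 == path_2

If the intended cache is instead a Streamlit decorator on create_chat(), st.cache_resource would give the same run-once-per-process behavior; no decorator line appears in the hunks shown below.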
app.py CHANGED
@@ -25,11 +25,6 @@ st.set_page_config(
     page_icon="π",
     layout="wide")
 
-if "modelfile" not in st.session_state:
-    st.session_state.modelfile = hf_hub_download(
-        repo_id=os.environ.get("REPO_ID", "bartowski/gemma-2-2b-it-GGUF"),
-        filename=os.environ.get("MODEL_FILE", "gemma-2-2b-it-Q5_K_M.gguf"),
-    )
 
 if "hf_model" not in st.session_state:
     st.session_state.hf_model = "Gemma2-2B-it"
@@ -79,14 +74,20 @@ def genRANstring(n):
 def create_chat():
     # Set HF API token and HF repo
     from llama_cpp import Llama
+    modelfile = hf_hub_download(
+        repo_id=os.environ.get("REPO_ID", "bartowski/gemma-2-2b-it-GGUF"),
+        filename=os.environ.get("MODEL_FILE", "gemma-2-2b-it-Q5_K_M.gguf"),
+    )
     client = Llama(
-        model_path=st.session_state.modelfile,
+        model_path=modelfile,
         #n_gpu_layers=-1, #enable GPU
+        n_threads=2,
         temperature=0.24,
         n_ctx=nCTX,
         max_tokens=600,
         repeat_penalty=1.176,
         stop=sTOPS,
+        flash_attn=True,
         verbose=verbosity,
     )
     print('loading gemma-2-2b-it-Q5_K_M.gguf with LlamaCPP...')
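For reference, a self-contained sketch of the loader after this patch, plus a call site. nCTX, sTOPS, and verbosity are stand-ins for settings app.py defines elsewhere; note that in llama-cpp-python the sampling options (temperature, max_tokens, repeat_penalty, stop) are arguments of the completion call rather than the constructor, so this sketch passes them at generation time.

import os

from huggingface_hub import hf_hub_download
from llama_cpp import Llama

nCTX = 8192                # stand-in; app.py defines its own context size
sTOPS = ["<end_of_turn>"]  # stand-in Gemma-2 stop token; app.py defines its own list
verbosity = False          # stand-in; app.py defines its own flag

def create_chat() -> Llama:
    # Resolve the GGUF locally (downloaded on first use, cached afterwards).
    modelfile = hf_hub_download(
        repo_id=os.environ.get("REPO_ID", "bartowski/gemma-2-2b-it-GGUF"),
        filename=os.environ.get("MODEL_FILE", "gemma-2-2b-it-Q5_K_M.gguf"),
    )
    return Llama(
        model_path=modelfile,
        # n_gpu_layers=-1,  # enable GPU offload when a GPU is available
        n_threads=2,        # two CPU threads, as in the commit
        n_ctx=nCTX,
        flash_attn=True,    # use llama.cpp's flash-attention kernels where supported
        verbose=verbosity,
    )

llm = create_chat()
out = llm(
    "Q: Name the planets in the solar system. A: ",
    max_tokens=600,
    temperature=0.24,
    repeat_penalty=1.176,
    stop=sTOPS,
)
print(out["choices"][0]["text"])

Pinning n_threads to 2 presumably matches the two vCPUs of a basic CPU Space, avoiding thread oversubscription, and flash_attn=True takes effect only where the underlying llama.cpp build supports it.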