Update app.py
moved modelfile to @cache, add flash attention and number of threads=2
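The "@cache" in the message most plausibly refers to hf_hub_download's on-disk cache (under ~/.cache/huggingface by default): once the GGUF has been fetched, later calls simply resolve the local path, so moving the download out of st.session_state and into create_chat() costs nothing on reruns. A minimal sketch of that behavior, under that assumption:

from huggingface_hub import hf_hub_download

# First call downloads the file; later calls hit the on-disk cache and
# return the same local path without re-transferring the weights.
path_1 = hf_hub_download(
    repo_id="bartowski/gemma-2-2b-it-GGUF",
    filename="gemma-2-2b-it-Q5_K_M.gguf",
)
path_2 = hf_hub_download(  # cache hit: resolves locally
    repo_id="bartowski/gemma-2-2b-it-GGUF",
    filename="gemma-2-2b-it-Q5_K_M.gguf",
)
assert path_1 == path_2

If the intended cache is instead a Streamlit decorator on create_chat(), st.cache_resource would give the same run-once-per-process behavior; no decorator line appears in the hunks shown below.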
app.py CHANGED
@@ -25,11 +25,6 @@ st.set_page_config(
     page_icon="π",
     layout="wide")
 
-if "modelfile" not in st.session_state:
-    st.session_state.modelfile = hf_hub_download(
-        repo_id=os.environ.get("REPO_ID", "bartowski/gemma-2-2b-it-GGUF"),
-        filename=os.environ.get("MODEL_FILE", "gemma-2-2b-it-Q5_K_M.gguf"),
-    )
 
 if "hf_model" not in st.session_state:
     st.session_state.hf_model = "Gemma2-2B-it"
@@ -79,14 +74,20 @@ def genRANstring(n):
 def create_chat():
     # Set HF API token and HF repo
     from llama_cpp import Llama
+    modelfile = hf_hub_download(
+        repo_id=os.environ.get("REPO_ID", "bartowski/gemma-2-2b-it-GGUF"),
+        filename=os.environ.get("MODEL_FILE", "gemma-2-2b-it-Q5_K_M.gguf"),
+    )
     client = Llama(
-        model_path=st.session_state.modelfile,
+        model_path=modelfile,
         #n_gpu_layers=-1, #enable GPU
+        n_threads=2,
         temperature=0.24,
         n_ctx=nCTX,
         max_tokens=600,
         repeat_penalty=1.176,
         stop=sTOPS,
+        flash_attn=True,
         verbose=verbosity,
     )
     print('loading gemma-2-2b-it-Q5_K_M.gguf with LlamaCPP...')
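For reference, a self-contained sketch of the loader after this patch, plus a call site. nCTX, sTOPS, and verbosity are stand-ins for settings app.py defines elsewhere; note that in llama-cpp-python the sampling options (temperature, max_tokens, repeat_penalty, stop) are arguments of the completion call rather than the constructor, so this sketch passes them at generation time.

import os

from huggingface_hub import hf_hub_download
from llama_cpp import Llama

nCTX = 8192                # stand-in; app.py defines its own context size
sTOPS = ["<end_of_turn>"]  # stand-in Gemma-2 stop token; app.py defines its own list
verbosity = False          # stand-in; app.py defines its own flag

def create_chat() -> Llama:
    # Resolve the GGUF locally (downloaded on first use, cached afterwards).
    modelfile = hf_hub_download(
        repo_id=os.environ.get("REPO_ID", "bartowski/gemma-2-2b-it-GGUF"),
        filename=os.environ.get("MODEL_FILE", "gemma-2-2b-it-Q5_K_M.gguf"),
    )
    return Llama(
        model_path=modelfile,
        # n_gpu_layers=-1,  # enable GPU offload when a GPU is available
        n_threads=2,        # two CPU threads, as in the commit
        n_ctx=nCTX,
        flash_attn=True,    # use llama.cpp's flash-attention kernels where supported
        verbose=verbosity,
    )

llm = create_chat()
out = llm(
    "Q: Name the planets in the solar system. A: ",
    max_tokens=600,
    temperature=0.24,
    repeat_penalty=1.176,
    stop=sTOPS,
)
print(out["choices"][0]["text"])

Pinning n_threads to 2 presumably matches the two vCPUs of a basic CPU Space, avoiding thread oversubscription, and flash_attn=True takes effect only where the underlying llama.cpp build supports it.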