Spaces:

ubermenchh
/

arxiv-retrieval

Build error

App Files Files Community

arxiv-retrieval / app.py

ubermenchh

Update app.py

f610abe about 2 years ago

raw

history blame contribute delete

3.6 kB

	import os
	import time

	import ctransformers
	import gradio as gr
	import pinecone
	import transformers
	from datasets import load_dataset
	from langchain.chains import RetrievalQA
	from langchain.embeddings.huggingface import HuggingFaceEmbeddings
	from langchain.llms import HuggingFacePipeline
	from langchain.vectorstores import Pinecone
	from torch import bfloat16

	# Sentence-transformers checkpoint used for every embedding in this script;
	# everything is pinned to the CPU.
	embed_model_id = 'sentence-transformers/all-MiniLM-L6-v2'
	device = 'cpu'

	embed_model = HuggingFaceEmbeddings(
	    model_name=embed_model_id,
	    model_kwargs=dict(device=device),
	    encode_kwargs=dict(device=device, batch_size=32),
	)

	# Embed two throwaway sentences. The length of the resulting vectors is
	# what the index-creation code later uses as the index dimension.
	docs = ['This is a document', 'and another document']
	embeddings = embed_model.embed_documents(docs)

	# Pinecone credentials are read from the environment (None when unset).
	api_key = os.environ.get('PINECONE_API_KEY')
	env_name = os.environ.get('PINECONE_ENV')
	pinecone.init(api_key=api_key, environment=env_name)

	index_name = 'llama-2-rag'

	# Create the index if it is not listed yet, sized to the width of the
	# sample embeddings, then poll once a second until it reports ready.
	if index_name not in pinecone.list_indexes():
	    pinecone.create_index(index_name, dimension=len(embeddings[0]), metric='cosine')
	    while True:
	        if pinecone.describe_index(index_name).status['ready']:
	            break
	        time.sleep(1)

	# Handle used for all subsequent upserts and queries.
	index = pinecone.Index(index_name)

	# Pull the pre-chunked Llama 2 arXiv paper dataset and convert it to a
	# pandas DataFrame so it can be sliced into batches.
	data = load_dataset('jamescalam/llama-2-arxiv-papers-chunked', split='train')
	data = data.to_pandas()
	batch_size = 32

	# Embed and upsert the corpus one batch at a time.
	# NOTE(review): this runs on every start-up and re-embeds the whole dataset.
	# Ids are derived from doi + chunk-id, so repeats presumably overwrite
	# rather than duplicate -- consider skipping once the index is populated.
	for start in range(0, len(data), batch_size):
	    # iloc slicing clamps at the end of the frame, so the final batch is
	    # simply shorter; no explicit min() is needed.
	    batch = data.iloc[start:start + batch_size]

	    # Read each column once instead of walking the rows with iterrows()
	    # three separate times per batch.
	    texts = batch['chunk'].tolist()
	    ids = [
	        f"{doi}-{chunk_id}"
	        for doi, chunk_id in zip(batch['doi'], batch['chunk-id'])
	    ]
	    metadata = [
	        {'text': text, 'source': source, 'title': paper_title}
	        for text, source, paper_title in zip(texts, batch['source'], batch['title'])
	    ]

	    embeds = embed_model.embed_documents(texts)

	    # Hand the client a concrete list of (id, vector, metadata) triples
	    # rather than a one-shot zip iterator.
	    index.upsert(vectors=list(zip(ids, embeds, metadata)))

	# GGML Llama 2 checkpoint to serve. Alternatives that were left commented
	# out in earlier revisions:
	#   TheBloke/Llama-2-7B-GGML
	#   TheBloke/Llama-2-7B-chat-GGML
	#   TheBloke/Llama-2-13B-GGML
	model_id = "TheBloke/Llama-2-13B-chat-GGML"
	hf_auth = os.environ.get('HF_AUTH_KEY')

	# An earlier, disabled loading path went through transformers with a
	# BitsAndBytesConfig (4-bit, 'nf4' quant type, double quantisation,
	# bfloat16 compute dtype), AutoConfig / AutoModelForCausalLM with
	# device_map='auto', and a transformers AutoTokenizer, all authenticated
	# with hf_auth. The GGML path below replaces it; hf_auth is not used here.

	## Using GGML Llama

	# Generation settings forwarded to ctransformers when the model is loaded.
	config = dict(
	    max_new_tokens=512,
	    repetition_penalty=1.1,
	    temperature=0.3,
	    stream=True,
	)
	model = ctransformers.AutoModelForCausalLM.from_pretrained(
	    model_id,
	    model_type='llama',
	    gpu_layers=130,  # 110 for 7b, 130 for 13b
	    hf=True,
	    **config,
	)
	# The tokenizer is built from the loaded model object itself.
	tokenizer = ctransformers.AutoTokenizer.from_pretrained(model)

	# Text-generation pipeline around the loaded model and tokenizer.
	generate_text = transformers.pipeline(
	    task='text-generation',
	    model=model,
	    tokenizer=tokenizer,
	    return_full_text=True,
	    max_new_tokens=512,
	    temperature=0.3,
	    repetition_penalty=1.1,
	)
	# Expose the pipeline to LangChain as an LLM.
	llm = HuggingFacePipeline(pipeline=generate_text)

	# 'text' is the metadata key under which each chunk's text is stored when
	# the vectors are upserted.
	text_field = 'text'
	vectorstore = Pinecone(index, embed_model.embed_query, text_field)

	# Retrieval-augmented QA chain using the 'stuff' chain type.
	rag_pipeline = RetrievalQA.from_chain_type(
	    chain_type='stuff',
	    retriever=vectorstore.as_retriever(),
	    llm=llm,
	)

	title = 'arxiv-retrieval'


	def predict(input):
	    """Answer a question with the retrieval-augmented QA chain.

	    Args:
	        input: The question text. The name shadows the builtin but is kept
	            so the function's signature stays unchanged.

	    Returns:
	        The value stored under ``'result'`` in the chain's output.
	    """
	    return rag_pipeline(input)['result']


	def _chat(message, history=None):
	    """Gradio callback that answers ``message`` and extends the transcript.

	    The interface declares two inputs (text, state) and two outputs
	    (chatbot, state), so the callback has to accept the stored history and
	    return both the transcript to display and the state to keep. ``predict``
	    alone takes one argument and returns one string, which does not match.

	    Args:
	        message: The question typed by the user.
	        history: Transcript from the session state; None on the first turn.

	    Returns:
	        A ``(transcript, state)`` pair, both the updated list of
	        ``(question, answer)`` tuples.
	    """
	    # Copy so the list held in the session state is never mutated in place.
	    # NOTE(review): assumes the chatbot component accepts (user, bot) pairs,
	    # as in the classic Gradio chatbot demo -- confirm for the pinned version.
	    history = list(history or [])
	    history.append((message, predict(message)))
	    return history, history


	gr.Interface(
	    fn=_chat,
	    inputs=['text', 'state'],
	    outputs=['chatbot', 'state'],
	    title=title,
	).launch()