OverSide88 committed
Commit 4db9478 · verified · Parent: 0cc488f

Upload 4 files

Files changed (5)
  1. .gitattributes +1 -0
  2. README.md +9 -0
  3. app.py +135 -0
  4. embeddings.txt +3 -0
  5. requirements.txt +56 -0
.gitattributes CHANGED
@@ -33,3 +33,4 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
  *.zip filter=lfs diff=lfs merge=lfs -text
  *.zst filter=lfs diff=lfs merge=lfs -text
  *tfevents* filter=lfs diff=lfs merge=lfs -text
+ embeddings.txt filter=lfs diff=lfs merge=lfs -text
README.md ADDED
@@ -0,0 +1,9 @@
+ # Smart Book Search
+
+ ## 🦸‍♂️ Team
+ 1. [Валерия](https://github.com/valeri2393)
+ 2. [Сауле](https://github.com/SauleBis)
+ 3. [Савр](https://github.com/SavrOverSide)
+
+ ## 🎯 Task
+ Collect a sample of at least 5,000 annotations from the [site](https://www.biblio-globus.ru/category?cid=182&pagenumber=1) and build a system that retrieves the books that best match a user query.
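The collection step itself is not part of this commit. Purely as an illustration, the sketch below shows one way the annotations could be gathered, assuming the listing is paginated via the `pagenumber` parameter visible in the URL above; the `collect_annotations` helper and every CSS selector in it are hypothetical placeholders, while the column names mirror the ones `app.py` reads from `data/data_final_version.csv`.

```python
# Hypothetical collection sketch (not included in the commit).  The category
# URL and its pagenumber parameter come from the README; every CSS selector
# below is a placeholder that must be adapted to the site's real markup.
import time

import pandas as pd
import requests
from bs4 import BeautifulSoup

BASE_URL = "https://www.biblio-globus.ru/category?cid=182&pagenumber={page}"


def collect_annotations(num_pages: int) -> pd.DataFrame:
    rows = []
    for page in range(1, num_pages + 1):
        html = requests.get(BASE_URL.format(page=page), timeout=30).text
        soup = BeautifulSoup(html, "html.parser")
        # ".product-card", ".title", ".author", ".annotation" are assumed names.
        for card in soup.select(".product-card"):
            rows.append({
                "title": card.select_one(".title").get_text(strip=True),
                "author": card.select_one(".author").get_text(strip=True),
                "annotation": card.select_one(".annotation").get_text(strip=True),
                "image_url": card.select_one("img")["src"],
            })
        time.sleep(1)  # be polite to the server
    return pd.DataFrame(rows)


# Enough pages to pass the 5,000-annotation target, saved in the layout
# that app.py expects:
# collect_annotations(300).to_csv("data/data_final_version.csv", index=False)
```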
app.py ADDED
@@ -0,0 +1,135 @@
+ import streamlit as st
+ import pandas as pd
+ import torch
+ import numpy as np
+ from transformers import AutoTokenizer, AutoModel
+ import faiss
+ from streamlit.errors import StreamlitAPIException
+ import urllib.parse
+
+ import os
+ os.environ['KMP_DUPLICATE_LIB_OK'] = 'TRUE'
+
+ # Load model and tokenizer
+ model_name = "sentence-transformers/msmarco-distilbert-base-v3"
+ tokenizer = AutoTokenizer.from_pretrained(model_name)
+ model = AutoModel.from_pretrained(model_name)
+
+ # Load data
+ books = pd.read_csv('data/data_final_version.csv')
+
+ MAX_LEN = 300
+
+ def embed_bert_cls(text, model=model, tokenizer=tokenizer):
+     # Tokenize, run the encoder and return the L2-normalized [CLS] embedding
+     t = tokenizer(text,
+                   padding=True,
+                   truncation=True,
+                   return_tensors='pt',
+                   max_length=MAX_LEN)
+     with torch.no_grad():
+         model_output = model(**{k: v.to(model.device) for k, v in t.items()})
+     embeddings = model_output.last_hidden_state[:, 0, :]
+     embeddings = torch.nn.functional.normalize(embeddings)
+     return embeddings[0].cpu().squeeze()
+
+ # Load precomputed embeddings
+ embeddings = np.loadtxt('embeddings.txt')
+ embeddings_tensor = [torch.tensor(embedding) for embedding in embeddings]
+
+ # Create Faiss index (Faiss expects float32 vectors)
+ embeddings_matrix = np.stack(embeddings).astype('float32')
+ index = faiss.IndexFlatIP(embeddings_matrix.shape[1])
+ index.add(embeddings_matrix)
+
+ # CSS styles for the page background
+ background_image = """
+ <style>
+ .stApp {
+     background-image: url("https://img.freepik.com/premium-photo/blur-image-book_9563-1100.jpg");
+     background-size: cover;
+     background-position: center;
+     background-repeat: no-repeat;
+ }
+ </style>
+ """
+
+ # Inject the CSS into the Streamlit app
+ st.markdown(background_image, unsafe_allow_html=True)
+
+ # CSS styles for boxes with a semi-transparent background
+ transparent_title = """
+ <style>
+ .transparent-title {
+     background-color: rgba(255, 255, 255, 0.7);
+     padding: 10px;
+     border-radius: 5px;
+     box-shadow: 0px 0px 10px rgba(0, 0, 0, 0.1);
+ }
+ </style>
+ """
+
+ transparent_box = """
+ <style>
+ .transparent-box {
+     background-color: rgba(255, 255, 255, 0.7);
+     padding: 10px;
+     border-radius: 5px;
+     box-shadow: 0px 0px 10px rgba(0, 0, 0, 0.1);
+ }
+ </style>
+ """
+
+ # Inject the CSS into the Streamlit app
+ st.markdown(transparent_title, unsafe_allow_html=True)
+ st.markdown(transparent_box, unsafe_allow_html=True)
+
+ # Streamlit interface
+ st.markdown('<h1 class="transparent-title">🎓📚Приложение для рекомендаций книг📚🎓</h1>', unsafe_allow_html=True)
+
+ # Query input controls
+ text = st.text_input('Введите ваш запрос для поиска книг:')
+ num_results = st.number_input('Количество результатов:', min_value=1, max_value=20, value=3)
+ recommend_button = st.button('Получить рекомендации')
+
+ if text and recommend_button:  # Check that the user entered text and clicked the button
+     # Embed the query and search for the nearest vectors with Faiss
+     query_embedding = embed_bert_cls(text)
+     query_embedding = query_embedding.numpy().astype('float32')
+     _, indices = index.search(np.expand_dims(query_embedding, axis=0), num_results)
+
+     st.subheader('Рекомендации по вашему запросу:')
+     for i in indices[0]:
+         recommended_embedding = embeddings_tensor[i].numpy()  # Vector of the recommended book
+         similarity = np.dot(query_embedding, recommended_embedding) / (np.linalg.norm(query_embedding) * np.linalg.norm(recommended_embedding))  # Cosine similarity
+         similarity_percent = similarity * 100
+
+         col1, col2 = st.columns([1, 3])
+         with col1:
+             image_url = books['image_url'][i]
+             if pd.isna(image_url) or not image_url or image_url.strip() == '':
+                 st.write("Обложка не найдена")
+             else:
+                 try:
+                     st.image(image_url, use_column_width=True)
+                 except Exception as e:
+                     st.write("Обложка не найдена")
+                     st.write(e)
+
+         with col2:
+             # Show the book details on a semi-transparent background
+             st.markdown(f"""
+                 <div class="transparent-box">
+                     <p><b>Название книги:</b> {books['title'][i]}</p>
+                     <p><b>Автор:</b> {books['author'][i]}</p>
+                     <p><b>Описание:</b> {books['annotation'][i]}</p>
+                     <p><b>Оценка сходства:</b> {similarity_percent:.2f}%</p>
+                 </div>
+             """, unsafe_allow_html=True)
+
+         st.write("---")
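`app.py` only loads the document vectors from `embeddings.txt`, which is shipped through Git LFS; the script that produced them is not in this commit. Below is a minimal offline sketch under the assumption that the vectors were computed with the same model, CLS pooling and `annotation` column that `app.py` uses at query time (the chunked encoding is an illustrative choice, not necessarily the authors' script).

```python
# Assumed offline step (not part of this commit): encode every annotation with
# the same CLS pooling and L2 normalisation that app.py applies to queries,
# then write the matrix to embeddings.txt for np.loadtxt to read back.
import numpy as np
import pandas as pd
import torch
from transformers import AutoModel, AutoTokenizer

model_name = "sentence-transformers/msmarco-distilbert-base-v3"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModel.from_pretrained(model_name)
MAX_LEN = 300


@torch.no_grad()
def encode(texts):
    # Batch-encode a list of strings into normalised CLS embeddings.
    t = tokenizer(texts, padding=True, truncation=True,
                  return_tensors="pt", max_length=MAX_LEN)
    out = model(**{k: v.to(model.device) for k, v in t.items()})
    emb = torch.nn.functional.normalize(out.last_hidden_state[:, 0, :])
    return emb.cpu().numpy()


books = pd.read_csv("data/data_final_version.csv")
annotations = books["annotation"].fillna("").tolist()
# Encode in chunks so padding lengths stay reasonable; the chunk size is arbitrary.
chunks = [annotations[i:i + 64] for i in range(0, len(annotations), 64)]
np.savetxt("embeddings.txt", np.vstack([encode(chunk) for chunk in chunks]))
```

Because the stored vectors are L2-normalised, the inner-product search over `IndexFlatIP` in `app.py` ranks by cosine similarity, so the cosine score recomputed for each result agrees with the index ordering.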
embeddings.txt ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:72371c6be3b7ad73af2f3dcc03c00c1f86d1b341385b0778156f8b8a83d3977c
+ size 30783658
requirements.txt ADDED
@@ -0,0 +1,56 @@
+ altair==5.3.0
+ attrs==23.2.0
+ blinker==1.8.2
+ cachetools==5.3.3
+ certifi==2024.7.4
+ charset-normalizer==3.3.2
+ click==8.1.7
+ faiss-cpu==1.8.0.post1
+ filelock==3.15.4
+ fsspec==2024.6.1
+ gitdb==4.0.11
+ GitPython==3.1.43
+ huggingface-hub==0.23.4
+ idna==3.7
+ Jinja2==3.1.4
+ jsonschema==4.23.0
+ jsonschema-specifications==2023.12.1
+ markdown-it-py==3.0.0
+ MarkupSafe==2.1.5
+ mdurl==0.1.2
+ mpmath==1.3.0
+ networkx==3.3
+ numpy==1.26.4
+ packaging==24.1
+ pandas==2.2.2
+ pillow==10.4.0
+ protobuf==5.27.2
+ pyarrow==16.1.0
+ pydeck==0.9.1
+ Pygments==2.18.0
+ python-dateutil==2.9.0.post0
+ pytz==2024.1
+ PyYAML==6.0.1
+ referencing==0.35.1
+ regex==2024.5.15
+ requests==2.32.3
+ rich==13.7.1
+ rpds-py==0.19.0
+ safetensors==0.4.3
+ six==1.16.0
+ smmap==5.0.1
+ streamlit==1.36.0
+ sympy==1.13.0
+ tenacity==8.5.0
+ tokenizers==0.19.1
+ toml==0.10.2
+ toolz==0.12.1
+ torch==2.3.1
+ torchaudio==2.3.1
+ torchvision==0.18.1
+ tornado==6.4.1
+ tqdm==4.66.4
+ transformers==4.42.4
+ typing_extensions==4.12.2
+ tzdata==2024.1
+ urllib3==2.2.2