# NOTE: the lines originally here were file-viewer residue, not part of the program:
# a "Spaces: Sleeping" status banner, "File size: 12,789 Bytes", a column of repeated
# short hashes (c99bc7a, a7af970, 7066d20), and a line-number gutter running 1-246.
#!/usr/bin/env python3
"""
简洁版BERT+FAISS标语数据库
输入:产品/业务描述
输出:匹配的广告标语
"""
import numpy as np
import faiss
import json
from sentence_transformers import SentenceTransformer
from datasets import Dataset
import pandas as pd
class SloganDatabase:
    """Slogan lookup backed by sentence embeddings and a FAISS inner-product index.

    Each record is a dict with the keys ``brand``, ``category``,
    ``description``, ``slogan`` and ``search_text``. Row *i* of the FAISS
    index corresponds to ``self.slogans[i]``.
    """

    def __init__(self):
        """Load the sentence encoder and start with no index and no records."""
        self.encoder = SentenceTransformer('all-MiniLM-L6-v2')
        self.index = None
        self.slogans = []

    def create_dataset(self):
        """Return the built-in slogan records (jewelry and luxury goods).

        Returns:
            A list of dicts, one per brand, with the keys ``brand``,
            ``category``, ``description``, ``slogan`` and ``search_text``
            (brand, category and description joined by single spaces).
        """
        # Sample rows: [brand, category, description, slogan]
        data = [
            # Top-tier jewelry houses
            ["Tiffany & Co.", "jewelry", "luxury diamond jewelry and engagement rings", "A Diamond is Forever"],
            ["Cartier", "luxury_jewelry", "high-end jewelry watches and accessories", "L'art de vivre"],
            ["Van Cleef & Arpels", "jewelry", "French luxury jewelry and watches", "Poetry of Time"],
            ["Harry Winston", "jewelry", "rare diamonds and luxury jewelry", "Rare Jewels of the World"],
            ["Bulgari", "jewelry", "Italian luxury jewelry and watches", "Italian Excellence"],
            ["Chopard", "jewelry", "Swiss luxury jewelry and watches", "Happy Diamonds"],
            ["Graff", "jewelry", "exceptional diamonds and jewelry", "The Most Fabulous Jewels in the World"],
            ["Piaget", "jewelry", "Swiss luxury watches and jewelry", "Possession"],
            ["Boucheron", "jewelry", "French high jewelry and luxury watches", "Le Joaillier Depuis 1858"],
            ["Mikimoto", "jewelry", "cultured pearl jewelry", "The Originator of Cultured Pearls"],
            # Luxury fashion brands
            ["Louis Vuitton", "luxury_fashion", "luxury leather goods and fashion", "The Art of Travel"],
            ["Hermès", "luxury_fashion", "French luxury goods and accessories", "Luxury in the making"],
            ["Chanel", "luxury_fashion", "haute couture and luxury fashion", "Inside every woman there is a flower and a cat"],
            ["Gucci", "luxury_fashion", "Italian luxury fashion and accessories", "Quality is remembered long after price is forgotten"],
            ["Prada", "luxury_fashion", "Italian luxury fashion house", "Prada"],
            ["Dior", "luxury_fashion", "French luxury fashion and beauty", "Miss Dior"],
            ["Versace", "luxury_fashion", "Italian luxury fashion design", "Virtus"],
            ["Saint Laurent", "luxury_fashion", "French luxury fashion house", "Saint Laurent Paris"],
            ["Balenciaga", "luxury_fashion", "Spanish luxury fashion house", "Balenciaga"],
            ["Bottega Veneta", "luxury_fashion", "Italian luxury leather goods", "When your own initials are enough"],
            # Watch brands
            ["Rolex", "luxury_watches", "Swiss luxury watches and timepieces", "Perpetual, Spirit of Excellence"],
            ["Patek Philippe", "luxury_watches", "Swiss luxury watch manufacturer", "You never actually own a Patek Philippe"],
            ["Audemars Piguet", "luxury_watches", "Swiss luxury watch brand", "To break the rules, you must first master them"],
            ["Omega", "luxury_watches", "Swiss luxury watch manufacturer", "Precision"],
            ["TAG Heuer", "luxury_watches", "Swiss luxury watches", "Don't crack under pressure"],
            ["Breitling", "luxury_watches", "Swiss luxury watchmaker", "Instruments for Professionals"],
            ["IWC", "luxury_watches", "Swiss luxury watch company", "Engineered for men"],
            ["Jaeger-LeCoultre", "luxury_watches", "Swiss luxury watch manufacturer", "The World's Most Complicated Watches"],
            ["Vacheron Constantin", "luxury_watches", "Swiss luxury watch manufacturer", "One of Not Many"],
            ["A. Lange & Söhne", "luxury_watches", "German luxury watch manufacturer", "When nothing else will do"],
            # Fashion jewelry
            ["Pandora", "fashion_jewelry", "Danish jewelry brand charm bracelets", "Be Love"],
            ["Swarovski", "fashion_jewelry", "Austrian crystal jewelry and accessories", "Unleash Your Light"],
            ["Daniel Wellington", "fashion_watches", "Swedish watch brand minimalist design", "Live the moment"],
            ["Alex and Ani", "fashion_jewelry", "American jewelry brand spiritual bracelets", "Positive Energy"],
            ["Kendra Scott", "fashion_jewelry", "American jewelry designer colorful stones", "Live colorfully"],
            ["Monica Vinader", "fashion_jewelry", "British jewelry brand contemporary design", "Everyday luxury"],
            ["Mejuri", "fashion_jewelry", "Canadian jewelry brand everyday luxury", "Everyday fine"],
            ["Gorjana", "fashion_jewelry", "California jewelry brand layered necklaces", "Live your layer"],
            ["Kate Spade", "fashion_jewelry", "American fashion accessories jewelry", "Live colorfully"],
            ["Marc Jacobs", "fashion_jewelry", "American fashion designer accessories", "Marc Jacobs"],
            # Custom and diamond jewelry
            ["Blue Nile", "diamond_jewelry", "online diamond jewelry retailer", "Extraordinary diamonds for extraordinary moments"],
            ["James Allen", "diamond_jewelry", "online engagement ring retailer", "See it. Love it. Own it."],
            ["Brilliant Earth", "diamond_jewelry", "ethical diamond jewelry", "Brilliant Earth"],
            ["With Clarity", "diamond_jewelry", "lab-grown diamond jewelry", "Diamonds. Redefined."],
            ["Clean Origin", "diamond_jewelry", "lab-created diamond jewelry", "Grown with love"],
            ["Ritani", "diamond_jewelry", "engagement rings and wedding bands", "Love is in the details"],
            ["Vrai", "diamond_jewelry", "lab-grown diamond jewelry", "Created, not mined"],
            ["Catbird", "jewelry", "Brooklyn-based jewelry designer", "Made in Brooklyn"],
            ["Wwake", "jewelry", "contemporary fine jewelry designer", "Wwake"],
            ["Jacquie Aiche", "jewelry", "California jewelry designer bohemian luxury", "Jacquie Aiche"],
            # Jewelry brands with Chinese-language entries
            ["周大福", "jewelry", "香港珠宝品牌黄金钻石", "心意足金"],
            ["周生生", "jewelry", "香港珠宝品牌传统工艺", "传承经典"],
            ["老凤祥", "jewelry", "中国传统珠宝品牌黄金首饰", "老凤祥,真金不怕火炼"],
            ["六福珠宝", "jewelry", "香港珠宝品牌时尚设计", "六福临门"],
            ["潘多拉", "jewelry", "丹麦珠宝品牌串珠手链", "表达你的故事"],
            ["周大生", "jewelry", "中国珠宝品牌钻石首饰", "爱就在一起"],
            ["金伯利", "jewelry", "中国钻石珠宝品牌", "只为更好的你"],
            ["戴比尔斯", "diamond_jewelry", "钻石开采珠宝品牌", "钻石恒久远,一颗永流传"],
            ["施华洛世奇", "crystal_jewelry", "奥地利水晶珠宝品牌", "释放你的光芒"],
            ["谢瑞麟", "jewelry", "香港珠宝设计师品牌", "艺术珠宝"],
            # Luxury accessories
            ["Goyard", "luxury_accessories", "French luxury leather goods", "Goyard"],
            ["Moynat", "luxury_accessories", "French luxury leather goods", "Moynat"],
            ["Berluti", "luxury_accessories", "French luxury leather goods", "Berluti"],
            ["Valextra", "luxury_accessories", "Italian luxury leather goods", "Milanese excellence since 1937"],
            ["Loewe", "luxury_accessories", "Spanish luxury leather goods", "Craft"],
            ["Brunello Cucinelli", "luxury_fashion", "Italian luxury fashion cashmere", "Humanistic Enterprise"],
            ["Loro Piana", "luxury_fashion", "Italian luxury textile and clothing", "Excellence in natural fibers"],
            ["Kiton", "luxury_fashion", "Italian luxury menswear", "The most beautiful thing made by man"],
            ["Zegna", "luxury_fashion", "Italian luxury menswear", "What makes a man"],
            ["Brioni", "luxury_fashion", "Italian luxury menswear", "Roman style"],
            # Emerging luxury brands
            ["Jacquemus", "luxury_fashion", "French luxury fashion house", "La Montagne"],
            ["Ganni", "luxury_fashion", "Danish fashion brand", "Ganni"],
            ["Staud", "luxury_fashion", "American fashion brand", "Staud"],
            ["Cult Gaia", "luxury_accessories", "American accessories brand", "Cult Gaia"],
            ["Rosantica", "jewelry", "Italian jewelry brand", "Rosantica"],
            ["Alighieri", "jewelry", "British jewelry brand", "The Inferno"],
            ["Lizzie Fortunato", "jewelry", "American jewelry brand", "Lizzie Fortunato"],
            ["Aurate", "jewelry", "American jewelry brand", "Accessible luxury"],
            ["AUrate New York", "jewelry", "New York jewelry brand", "Radically responsible luxury"],
            ["Missoma", "jewelry", "British jewelry brand", "Missoma"],
        ]
        df = pd.DataFrame(data, columns=['brand', 'category', 'description', 'slogan'])
        # The text that gets embedded: brand, category and description combined.
        df['search_text'] = df['brand'] + ' ' + df['category'] + ' ' + df['description']
        return df.to_dict('records')

    def build_index(self, data):
        """Embed every record's ``search_text`` and build the FAISS index.

        Args:
            data: Non-empty list of record dicts, each with a ``search_text`` key.

        Raises:
            ValueError: If ``data`` is empty.
        """
        if len(data) == 0:
            raise ValueError("Cannot build an index from an empty dataset")
        print("🔨 Building FAISS index...")
        texts = [item['search_text'] for item in data]
        # FAISS expects a contiguous float32 matrix.
        embeddings = np.ascontiguousarray(
            self.encoder.encode(texts, show_progress_bar=True), dtype='float32'
        )
        # Size the inner-product index from the encoder's actual output width
        # instead of hard-coding it, so a different encoder still works.
        self.index = faiss.IndexFlatIP(embeddings.shape[1])
        self.index.add(embeddings)
        # Keep the records in the same order as the index rows.
        self.slogans = data
        print(f"✅ Index built with {len(data)} slogans")

    def search(self, query, k=5):
        """Return up to ``k`` records most similar to ``query``.

        Args:
            query: Free-text product or business description.
            k: Maximum number of results to return.

        Returns:
            A list of copies of the matching records, each with an added
            ``similarity_score`` float, in the order the index returned them.

        Raises:
            ValueError: If no index has been built or loaded yet.
        """
        if self.index is None:
            raise ValueError("Index not built yet!")
        query_embedding = np.ascontiguousarray(
            self.encoder.encode([query]), dtype='float32'
        )
        scores, indices = self.index.search(query_embedding, k)
        results = []
        for score, idx in zip(scores[0], indices[0]):
            idx = int(idx)
            # FAISS pads missing neighbours with label -1 when k exceeds the
            # number of indexed vectors; a negative label must not be used as
            # a Python index (it would silently select the last record).
            if 0 <= idx < len(self.slogans):
                result = self.slogans[idx].copy()
                result['similarity_score'] = float(score)
                results.append(result)
        return results

    def save(self, path="slogan_db"):
        """Write the index to ``{path}.faiss`` and the records to ``{path}.json``.

        Raises:
            ValueError: If no index has been built or loaded yet.
        """
        if self.index is None:
            raise ValueError("Index not built yet!")
        faiss.write_index(self.index, f"{path}.faiss")
        with open(f"{path}.json", 'w', encoding='utf-8') as f:
            json.dump(self.slogans, f, ensure_ascii=False, indent=2)
        print(f"💾 Database saved to {path}")

    def load(self, path="slogan_db"):
        """Load the index and records previously written by ``save``.

        Loading is best-effort: any ordinary error is reported and turned into
        a ``False`` return. State is only replaced once both files have been
        read, so a failed load leaves the instance unchanged.

        Returns:
            True when both files were loaded, False otherwise.
        """
        try:
            index = faiss.read_index(f"{path}.faiss")
            with open(f"{path}.json", 'r', encoding='utf-8') as f:
                slogans = json.load(f)
        except Exception:
            # Not a bare except: KeyboardInterrupt/SystemExit must propagate.
            print(f"❌ Failed to load database from {path}")
            return False
        self.index = index
        self.slogans = slogans
        print(f"📂 Database loaded from {path}")
        return True
def main():
    """Load the saved slogan database (or build and save one), then print demo searches."""
    print("🚀 Creating Slogan Database...")
    database = SloganDatabase()

    # Reuse a previously saved database when it loads; otherwise build and persist one.
    loaded = database.load()
    if not loaded:
        print("📊 Creating new database...")
        records = database.create_dataset()
        database.build_index(records)
        database.save()

    # Demo queries, in both Chinese and English.
    sample_queries = (
        "钻石订婚戒指",
        "奢侈品手袋",
        "瑞士手表品牌",
        "珍珠首饰",
        "黄金项链",
        "时尚耳环",
        "luxury jewelry brand",
        "designer handbag",
        "crystal accessories",
        "wedding rings",
    )
    print("\n🔍 Testing searches...")
    divider = "-" * 40
    for text in sample_queries:
        print(f"\n查询: {text}")
        print(divider)
        # Show the top three matches, numbered from 1.
        for rank, hit in enumerate(database.search(text, k=3), start=1):
            print(f"{rank}. {hit['brand']} ({hit['category']})")
            print(f" 描述: {hit['description']}")
            print(f" 标语: {hit['slogan']}")
            print(f" 相似度: {hit['similarity_score']:.3f}")
            print()
# Run the demo only when this file is executed as a script, not when imported.
if __name__ == "__main__":
    main()