Update app.py
Browse files
app.py
CHANGED
|
@@ -1418,6 +1418,8 @@ class AdvancedAIAnalyzer:
|
|
| 1418 |
|
| 1419 |
def fetch_aitimes_news(self) -> List[Dict]:
|
| 1420 |
"""AI Times์์ ์ค๋ ๋ ์ง ๋ด์ค ํฌ๋กค๋ง"""
|
|
|
|
|
|
|
| 1421 |
print("๐ฐ AI Times ๋ด์ค ์์ง ์ค...")
|
| 1422 |
|
| 1423 |
# ์์งํ URL ๋ชฉ๋ก
|
|
@@ -1438,112 +1440,63 @@ class AdvancedAIAnalyzer:
|
|
| 1438 |
response.raise_for_status()
|
| 1439 |
response.encoding = 'utf-8'
|
| 1440 |
|
| 1441 |
-
|
| 1442 |
-
|
| 1443 |
-
# ๋ด์ค ๊ธฐ์ฌ ์ฐพ๊ธฐ - ๋ค์ํ ํจํด ์๋
|
| 1444 |
-
articles = []
|
| 1445 |
-
|
| 1446 |
-
# ํจํด 1: article-list-content
|
| 1447 |
-
articles.extend(soup.find_all('div', class_='article-list-content'))
|
| 1448 |
|
| 1449 |
-
#
|
| 1450 |
-
|
|
|
|
|
|
|
| 1451 |
|
| 1452 |
-
#
|
| 1453 |
-
|
| 1454 |
|
| 1455 |
-
|
| 1456 |
-
if not articles:
|
| 1457 |
-
articles.extend(soup.find_all('article'))
|
| 1458 |
|
| 1459 |
-
|
| 1460 |
-
|
| 1461 |
-
articles.extend(soup.find_all('li'))
|
| 1462 |
-
|
| 1463 |
-
print(f" โ {len(articles)}๊ฐ ๊ธฐ์ฌ ๋ธ๋ก ๋ฐ๊ฒฌ")
|
| 1464 |
-
|
| 1465 |
-
for article in articles:
|
| 1466 |
try:
|
| 1467 |
-
|
| 1468 |
-
|
| 1469 |
-
link = None
|
| 1470 |
-
|
| 1471 |
-
# a ํ๊ทธ์์ ์ฐพ๊ธฐ
|
| 1472 |
-
a_tag = article.find('a')
|
| 1473 |
-
if a_tag:
|
| 1474 |
-
title_tag = a_tag
|
| 1475 |
-
link = a_tag.get('href', '')
|
| 1476 |
|
| 1477 |
-
#
|
| 1478 |
-
if
|
| 1479 |
-
strong_tag = title_tag.find('strong')
|
| 1480 |
-
if strong_tag:
|
| 1481 |
-
title = strong_tag.get_text(strip=True)
|
| 1482 |
-
else:
|
| 1483 |
-
title = title_tag.get_text(strip=True)
|
| 1484 |
-
else:
|
| 1485 |
continue
|
| 1486 |
|
| 1487 |
-
# ๋งํฌ
|
| 1488 |
-
|
| 1489 |
-
|
| 1490 |
-
link = 'https://www.aitimes.com' + link
|
| 1491 |
-
else:
|
| 1492 |
-
link = 'https://www.aitimes.com/' + link
|
| 1493 |
|
| 1494 |
-
# ๋ ์ง
|
| 1495 |
-
|
| 1496 |
-
|
| 1497 |
-
{'class': 'list-date'},
|
| 1498 |
-
{'class': 'date'},
|
| 1499 |
-
{'class': 'list-dated'},
|
| 1500 |
-
{'class': 'byline'},
|
| 1501 |
-
{'class': 'info'}
|
| 1502 |
-
]
|
| 1503 |
|
| 1504 |
-
|
| 1505 |
-
date_tag = article.find('span', pattern) or article.find('p', pattern)
|
| 1506 |
-
if date_tag:
|
| 1507 |
-
date_text = date_tag.get_text(strip=True)
|
| 1508 |
-
break
|
| 1509 |
|
| 1510 |
-
#
|
| 1511 |
-
if not date_text:
|
| 1512 |
-
text = article.get_text()
|
| 1513 |
-
import re
|
| 1514 |
-
date_match = re.search(r'\d{2}-\d{2}\s+\d{2}:\d{2}', text)
|
| 1515 |
-
if date_match:
|
| 1516 |
-
date_text = date_match.group()
|
| 1517 |
-
|
| 1518 |
-
# ์ ํจ์ฑ ๊ฒ์ฌ
|
| 1519 |
-
if not title or len(title) < 10:
|
| 1520 |
-
continue
|
| 1521 |
-
|
| 1522 |
-
if not link or 'javascript:' in link:
|
| 1523 |
-
continue
|
| 1524 |
-
|
| 1525 |
-
# ์ค๋ ๋ ์ง ํํฐ๋ง (๋ ์ง๊ฐ ์์ผ๋ฉด ์ผ๋จ ํฌํจ)
|
| 1526 |
-
if date_text and today not in date_text:
|
| 1527 |
continue
|
| 1528 |
|
| 1529 |
news_item = {
|
| 1530 |
'title': title,
|
| 1531 |
'url': link,
|
| 1532 |
-
'date': date_text
|
| 1533 |
'source': 'AI Times',
|
| 1534 |
'category': 'AI'
|
| 1535 |
}
|
| 1536 |
|
| 1537 |
all_news.append(news_item)
|
| 1538 |
-
|
|
|
|
|
|
|
| 1539 |
|
| 1540 |
except Exception as e:
|
| 1541 |
continue
|
| 1542 |
|
|
|
|
| 1543 |
time.sleep(1) # ์๋ฒ ๋ถํ ๋ฐฉ์ง
|
| 1544 |
|
| 1545 |
except Exception as e:
|
| 1546 |
-
print(f" โ ๏ธ URL ์์ง ์ค๋ฅ: {e}")
|
| 1547 |
continue
|
| 1548 |
|
| 1549 |
# ์ค๋ณต ์ ๊ฑฐ (URL ๊ธฐ์ค)
|
|
@@ -1554,11 +1507,11 @@ class AdvancedAIAnalyzer:
|
|
| 1554 |
unique_news.append(news)
|
| 1555 |
seen_urls.add(news['url'])
|
| 1556 |
|
| 1557 |
-
print(f"
|
| 1558 |
|
| 1559 |
# ์ต์ 3๊ฐ๋ ๋ณด์ฅ (์์ผ๋ฉด ์ํ ์ถ๊ฐ)
|
| 1560 |
if len(unique_news) < 3:
|
| 1561 |
-
print("โ ๏ธ ๋ด์ค๊ฐ ๋ถ์กฑํ์ฌ ์ต๊ทผ ์ํ
|
| 1562 |
sample_news = [
|
| 1563 |
{
|
| 1564 |
'title': 'MS "์ฑGPT ์์ ํญ์ฆ์ผ๋ก ๋ฐ์ดํฐ์ผํฐ ๋ถ์กฑ...2026๋
๊น์ง ์ง์"',
|
|
|
|
| 1418 |
|
| 1419 |
def fetch_aitimes_news(self) -> List[Dict]:
|
| 1420 |
"""AI Times์์ ์ค๋ ๋ ์ง ๋ด์ค ํฌ๋กค๋ง"""
|
| 1421 |
+
import re
|
| 1422 |
+
|
| 1423 |
print("๐ฐ AI Times ๋ด์ค ์์ง ์ค...")
|
| 1424 |
|
| 1425 |
# ์์งํ URL ๋ชฉ๋ก
|
|
|
|
| 1440 |
response.raise_for_status()
|
| 1441 |
response.encoding = 'utf-8'
|
| 1442 |
|
| 1443 |
+
text = response.text
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1444 |
|
| 1445 |
+
# ํจํด: [์ ๋ชฉ](๋งํฌ)...๋ ์ง
|
| 1446 |
+
# ์: [MS "๊ธ์ฆํ๋ '์ฑGPT' ์์๋ก ๋ฐ์ดํฐ์ผํฐ ๋ถ์กฑ...2026๋
๊น์ง ์ง์๋ ๋ฏ"](https://www.aitimes.com/news/articleView.html?idxno=203055)
|
| 1447 |
+
# ...
|
| 1448 |
+
# ์ฐ์
์ผ๋ฐ๋ฐ์ฐฌ ๊ธฐ์10-10 15:10
|
| 1449 |
|
| 1450 |
+
# ์ ๋ชฉ๊ณผ ๋งํฌ ๋งค์นญ ํจํด
|
| 1451 |
+
pattern = r'\[([^\]]+)\]\((https://www\.aitimes\.com/news/articleView\.html\?idxno=\d+)\)'
|
| 1452 |
|
| 1453 |
+
matches = re.finditer(pattern, text)
|
|
|
|
|
|
|
| 1454 |
|
| 1455 |
+
articles_found = 0
|
| 1456 |
+
for match in matches:
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1457 |
try:
|
| 1458 |
+
title = match.group(1).strip()
|
| 1459 |
+
link = match.group(2).strip()
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1460 |
|
| 1461 |
+
# ์ ๋ชฉ์ด ๋๋ฌด ์งง์ผ๋ฉด ์คํต
|
| 1462 |
+
if len(title) < 10:
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1463 |
continue
|
| 1464 |
|
| 1465 |
+
# ํด๋น ๊ธฐ์ฌ์ ๋ ์ง ์ฐพ๊ธฐ (๋งํฌ ๋ค์์ 100์ ์ด๋ด)
|
| 1466 |
+
pos = match.end()
|
| 1467 |
+
nearby_text = text[pos:pos+200]
|
|
|
|
|
|
|
|
|
|
| 1468 |
|
| 1469 |
+
# ๋ ์ง ํจํด: 10-10 15:10 ํ์
|
| 1470 |
+
date_pattern = r'(\d{2}-\d{2}\s+\d{2}:\d{2})'
|
| 1471 |
+
date_match = re.search(date_pattern, nearby_text)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1472 |
|
| 1473 |
+
date_text = date_match.group(1) if date_match else today
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1474 |
|
| 1475 |
+
# ์ค๋ ๋ ์ง๋ง ํํฐ๋ง
|
| 1476 |
+
if today not in date_text:
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1477 |
continue
|
| 1478 |
|
| 1479 |
news_item = {
|
| 1480 |
'title': title,
|
| 1481 |
'url': link,
|
| 1482 |
+
'date': date_text,
|
| 1483 |
'source': 'AI Times',
|
| 1484 |
'category': 'AI'
|
| 1485 |
}
|
| 1486 |
|
| 1487 |
all_news.append(news_item)
|
| 1488 |
+
articles_found += 1
|
| 1489 |
+
|
| 1490 |
+
print(f" โ ์ถ๊ฐ: {title[:60]}... ({date_text})")
|
| 1491 |
|
| 1492 |
except Exception as e:
|
| 1493 |
continue
|
| 1494 |
|
| 1495 |
+
print(f" โ {articles_found}๊ฐ ์ค๋์ ๊ธฐ์ฌ ๋ฐ๊ฒฌ\n")
|
| 1496 |
time.sleep(1) # ์๋ฒ ๋ถํ ๋ฐฉ์ง
|
| 1497 |
|
| 1498 |
except Exception as e:
|
| 1499 |
+
print(f" โ ๏ธ URL ์์ง ์ค๋ฅ: {e}\n")
|
| 1500 |
continue
|
| 1501 |
|
| 1502 |
# ์ค๋ณต ์ ๊ฑฐ (URL ๊ธฐ์ค)
|
|
|
|
| 1507 |
unique_news.append(news)
|
| 1508 |
seen_urls.add(news['url'])
|
| 1509 |
|
| 1510 |
+
print(f"โ
์ด {len(unique_news)}๊ฐ ์ค๋ณต ์ ๊ฑฐ๋ ์ค๋์ ๋ด์ค\n")
|
| 1511 |
|
| 1512 |
# ์ต์ 3๊ฐ๋ ๋ณด์ฅ (์์ผ๋ฉด ์ํ ์ถ๊ฐ)
|
| 1513 |
if len(unique_news) < 3:
|
| 1514 |
+
print("โ ๏ธ ๋ด์ค๊ฐ ๋ถ์กฑํ์ฌ ์ต๊ทผ ์ํ ์ถ๊ฐ\n")
|
| 1515 |
sample_news = [
|
| 1516 |
{
|
| 1517 |
'title': 'MS "์ฑGPT ์์ ํญ์ฆ์ผ๋ก ๋ฐ์ดํฐ์ผํฐ ๋ถ์กฑ...2026๋
๊น์ง ์ง์"',
|