ginipick committed on
Commit 4a10154 · verified · 1 Parent(s): 7352b74

Update app.py

Files changed (1):
app.py +33 -80
app.py CHANGED
@@ -1418,6 +1418,8 @@ class AdvancedAIAnalyzer:
 
     def fetch_aitimes_news(self) -> List[Dict]:
         """Crawl today's news from AI Times"""
+        import re
+
         print("📰 Collecting AI Times news...")
 
         # URLs to collect
@@ -1438,112 +1440,63 @@ class AdvancedAIAnalyzer:
                 response.raise_for_status()
                 response.encoding = 'utf-8'
 
-                soup = BeautifulSoup(response.text, 'html.parser')
-
-                # Find news articles - try various patterns
-                articles = []
-
-                # Pattern 1: article-list-content
-                articles.extend(soup.find_all('div', class_='article-list-content'))
-
-                # Pattern 2: list-block
-                articles.extend(soup.find_all('div', class_='list-block'))
-
-                # Pattern 3: article-list
-                articles.extend(soup.find_all('li', class_='article-list'))
-
-                # Pattern 4: plain article tags
-                if not articles:
-                    articles.extend(soup.find_all('article'))
-
-                # Pattern 5: list items
-                if not articles:
-                    articles.extend(soup.find_all('li'))
-
-                print(f"  → {len(articles)} article blocks found")
-
-                for article in articles:
+                text = response.text
+
+                # Pattern: [title](link)...date
+                # e.g.: [MS "급증하는 '챗GPT' 수요로 데이터센터 부족...2026년까지 지속될 듯"](https://www.aitimes.com/news/articleView.html?idxno=203055)
+                # ...
+                # 산업일반박찬 기자10-10 15:10
+
+                # Pattern matching a title and its link
+                pattern = r'\[([^\]]+)\]\((https://www\.aitimes\.com/news/articleView\.html\?idxno=\d+)\)'
+
+                matches = re.finditer(pattern, text)
+
+                articles_found = 0
+                for match in matches:
                     try:
-                        # Find the title and link - try several methods
-                        title_tag = None
-                        link = None
-
-                        # Look in the a tag
-                        a_tag = article.find('a')
-                        if a_tag:
-                            title_tag = a_tag
-                            link = a_tag.get('href', '')
-
-                        # Prefer the title from a strong tag
-                        if title_tag:
-                            strong_tag = title_tag.find('strong')
-                            if strong_tag:
-                                title = strong_tag.get_text(strip=True)
-                            else:
-                                title = title_tag.get_text(strip=True)
-                        else:
+                        title = match.group(1).strip()
+                        link = match.group(2).strip()
+
+                        # Skip titles that are too short
+                        if len(title) < 10:
                             continue
 
-                        # Normalize the link
-                        if link and not link.startswith('http'):
-                            if link.startswith('/'):
-                                link = 'https://www.aitimes.com' + link
-                            else:
-                                link = 'https://www.aitimes.com/' + link
-
-                        # Find the date - try several patterns
-                        date_text = ''
-                        date_patterns = [
-                            {'class': 'list-date'},
-                            {'class': 'date'},
-                            {'class': 'list-dated'},
-                            {'class': 'byline'},
-                            {'class': 'info'}
-                        ]
-
-                        for pattern in date_patterns:
-                            date_tag = article.find('span', pattern) or article.find('p', pattern)
-                            if date_tag:
-                                date_text = date_tag.get_text(strip=True)
-                                break
-
-                        # If no date tag, search the full text
-                        if not date_text:
-                            text = article.get_text()
-                            import re
-                            date_match = re.search(r'\d{2}-\d{2}\s+\d{2}:\d{2}', text)
-                            if date_match:
-                                date_text = date_match.group()
-
-                        # Validation
-                        if not title or len(title) < 10:
-                            continue
-
-                        if not link or 'javascript:' in link:
-                            continue
-
-                        # Filter to today's date (keep items with no date)
-                        if date_text and today not in date_text:
+                        # Find this article's date (within 200 chars after the link)
+                        pos = match.end()
+                        nearby_text = text[pos:pos+200]
+
+                        # Date pattern: '10-10 15:10' format
+                        date_pattern = r'(\d{2}-\d{2}\s+\d{2}:\d{2})'
+                        date_match = re.search(date_pattern, nearby_text)
+
+                        date_text = date_match.group(1) if date_match else today
+
+                        # Keep today's articles only
+                        if today not in date_text:
                             continue
 
                         news_item = {
                             'title': title,
                             'url': link,
-                            'date': date_text if date_text else today,
+                            'date': date_text,
                             'source': 'AI Times',
                             'category': 'AI'
                         }
 
                         all_news.append(news_item)
-                        print(f"  ✓ Added: {title[:60]}...")
+                        articles_found += 1
+
+                        print(f"  ✓ Added: {title[:60]}... ({date_text})")
 
                     except Exception as e:
                         continue
 
+                print(f"  → found {articles_found} articles from today\n")
                 time.sleep(1)  # avoid overloading the server
 
             except Exception as e:
-                print(f"  ⚠️ URL collection error: {e}")
+                print(f"  ⚠️ URL collection error: {e}\n")
                 continue
 
         # Remove duplicates (by URL)
@@ -1554,11 +1507,11 @@ class AdvancedAIAnalyzer:
                 unique_news.append(news)
                 seen_urls.add(news['url'])
 
-        print(f"\n✅ Collected {len(unique_news)} news items from today\n")
+        print(f"✅ {len(unique_news)} deduplicated news items from today\n")
 
         # Guarantee at least 3 items (add samples if missing)
         if len(unique_news) < 3:
-            print("⚠️ Not enough news; adding recent samples")
+            print("⚠️ Not enough news; adding recent samples\n")
            sample_news = [
                {
                    'title': 'MS "챗GPT 수요 폭증으로 데이터센터 부족...2026년까지 지속"',
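For context, here is a minimal standalone sketch of the extraction logic this commit switches to, runnable against a hypothetical text snippet shaped like the example in the diff's comments. The SAMPLE_TEXT value, the extract_today_articles name, and the 'MM-DD' format of today are illustrative assumptions, not part of app.py.

import re
from typing import Dict, List

# Hypothetical sample of the crawled text, in the [title](link) + byline
# shape that the diff's comments describe.
SAMPLE_TEXT = (
    '[MS "급증하는 \'챗GPT\' 수요로 데이터센터 부족...2026년까지 지속될 듯"]'
    '(https://www.aitimes.com/news/articleView.html?idxno=203055)\n'
    '산업일반박찬 기자10-10 15:10\n'
)

def extract_today_articles(text: str, today: str) -> List[Dict]:
    """Mirror of the committed logic: match [title](link) pairs, then look
    for an MM-DD HH:MM timestamp within 200 chars after each link."""
    pattern = r'\[([^\]]+)\]\((https://www\.aitimes\.com/news/articleView\.html\?idxno=\d+)\)'
    results = []
    for match in re.finditer(pattern, text):
        title = match.group(1).strip()
        link = match.group(2).strip()
        if len(title) < 10:  # skip overly short titles, as the commit does
            continue
        nearby_text = text[match.end():match.end() + 200]
        date_match = re.search(r'(\d{2}-\d{2}\s+\d{2}:\d{2})', nearby_text)
        date_text = date_match.group(1) if date_match else today
        if today not in date_text:  # plain substring filter
            continue
        results.append({'title': title, 'url': link, 'date': date_text,
                        'source': 'AI Times', 'category': 'AI'})
    return results

# today is assumed to be an 'MM-DD' string, e.g. datetime.now().strftime('%m-%d')
print(extract_today_articles(SAMPLE_TEXT, today='10-10'))

Note that the date filter is a plain substring check, so it only works while today is formatted the same way as the site's "10-10 15:10" bylines.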
 
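Similarly, a short sketch of the dedup-and-fallback step that follows the crawl, with a hypothetical dedupe_and_backfill helper. The exact backfill behavior is an assumption, since the diff only shows that hardcoded samples are added when fewer than 3 items survive and truncates the sample_news list.

from typing import Dict, List

def dedupe_and_backfill(all_news: List[Dict], sample_news: List[Dict]) -> List[Dict]:
    """Drop duplicate URLs (first occurrence wins), then top the list up to
    at least 3 items from the hardcoded samples, as the diff indicates."""
    seen_urls = set()
    unique_news = []
    for news in all_news:
        if news['url'] not in seen_urls:
            unique_news.append(news)
            seen_urls.add(news['url'])
    if len(unique_news) < 3:
        # Assumed backfill: append unseen samples until 3 items exist.
        for sample in sample_news:
            if len(unique_news) >= 3:
                break
            if sample.get('url', '') not in seen_urls:
                unique_news.append(sample)
                seen_urls.add(sample.get('url', ''))
    return unique_news

Keeping insertion order means freshly crawled items stay ahead of any samples in the returned list.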