ginipick committed on
Commit 4a10154 · verified · 1 Parent(s): 7352b74

Update app.py

Files changed (1):
app.py +33 -80
app.py CHANGED
@@ -1418,6 +1418,8 @@ class AdvancedAIAnalyzer:
 
     def fetch_aitimes_news(self) -> List[Dict]:
         """Crawl today's news from AI Times"""
+        import re
+
         print("📰 Collecting AI Times news...")
 
         # URLs to collect
@@ -1438,112 +1440,63 @@ class AdvancedAIAnalyzer:
                 response.raise_for_status()
                 response.encoding = 'utf-8'
 
-                soup = BeautifulSoup(response.text, 'html.parser')
-
-                # Find news articles - try various patterns
-                articles = []
-
-                # Pattern 1: article-list-content
-                articles.extend(soup.find_all('div', class_='article-list-content'))
-
-                # Pattern 2: list-block
-                articles.extend(soup.find_all('div', class_='list-block'))
-
-                # Pattern 3: article-list
-                articles.extend(soup.find_all('li', class_='article-list'))
-
-                # Pattern 4: plain article tags
-                if not articles:
-                    articles.extend(soup.find_all('article'))
-
-                # Pattern 5: list items
-                if not articles:
-                    articles.extend(soup.find_all('li'))
-
-                print(f"  → {len(articles)} article blocks found")
-
-                for article in articles:
+                text = response.text
+
+                # Pattern: [title](link)...date
+                # e.g.: [MS "급증하는 '챗GPT' 수요로 데이터센터 부족...2026년까지 지속될 듯"](https://www.aitimes.com/news/articleView.html?idxno=203055)
+                # ...
+                # 산업일반박찬 기자10-10 15:10
+
+                # Pattern matching a title and its link
+                pattern = r'\[([^\]]+)\]\((https://www\.aitimes\.com/news/articleView\.html\?idxno=\d+)\)'
+
+                matches = re.finditer(pattern, text)
+
+                articles_found = 0
+                for match in matches:
                     try:
-                        # Find the title and link - try several methods
-                        title_tag = None
-                        link = None
-
-                        # Look in the a tag
-                        a_tag = article.find('a')
-                        if a_tag:
-                            title_tag = a_tag
-                            link = a_tag.get('href', '')
-
-                        # Prefer the title from a strong tag
-                        if title_tag:
-                            strong_tag = title_tag.find('strong')
-                            if strong_tag:
-                                title = strong_tag.get_text(strip=True)
-                            else:
-                                title = title_tag.get_text(strip=True)
-                        else:
+                        title = match.group(1).strip()
+                        link = match.group(2).strip()
+
+                        # Skip titles that are too short
+                        if len(title) < 10:
                             continue
 
-                        # Normalize the link
-                        if link and not link.startswith('http'):
-                            if link.startswith('/'):
-                                link = 'https://www.aitimes.com' + link
-                            else:
-                                link = 'https://www.aitimes.com/' + link
-
-                        # Find the date - try several patterns
-                        date_text = ''
-                        date_patterns = [
-                            {'class': 'list-date'},
-                            {'class': 'date'},
-                            {'class': 'list-dated'},
-                            {'class': 'byline'},
-                            {'class': 'info'}
-                        ]
-
-                        for pattern in date_patterns:
-                            date_tag = article.find('span', pattern) or article.find('p', pattern)
-                            if date_tag:
-                                date_text = date_tag.get_text(strip=True)
-                                break
-
-                        # If no date tag, search the full text
-                        if not date_text:
-                            text = article.get_text()
-                            import re
-                            date_match = re.search(r'\d{2}-\d{2}\s+\d{2}:\d{2}', text)
-                            if date_match:
-                                date_text = date_match.group()
-
-                        # Validation
-                        if not title or len(title) < 10:
-                            continue
-
-                        if not link or 'javascript:' in link:
-                            continue
-
-                        # Filter to today's date (keep items with no date)
-                        if date_text and today not in date_text:
+                        # Find this article's date (within 200 chars after the link)
+                        pos = match.end()
+                        nearby_text = text[pos:pos+200]
+
+                        # Date pattern: '10-10 15:10' format
+                        date_pattern = r'(\d{2}-\d{2}\s+\d{2}:\d{2})'
+                        date_match = re.search(date_pattern, nearby_text)
+
+                        date_text = date_match.group(1) if date_match else today
+
+                        # Keep today's articles only
+                        if today not in date_text:
                             continue
 
                         news_item = {
                             'title': title,
                             'url': link,
-                            'date': date_text if date_text else today,
+                            'date': date_text,
                             'source': 'AI Times',
                             'category': 'AI'
                         }
 
                         all_news.append(news_item)
-                        print(f"  ✓ Added: {title[:60]}...")
+                        articles_found += 1
+
+                        print(f"  ✓ Added: {title[:60]}... ({date_text})")
 
                     except Exception as e:
                         continue
 
+                print(f"  → found {articles_found} articles from today\n")
                 time.sleep(1)  # avoid overloading the server
 
             except Exception as e:
-                print(f"  ⚠️ URL collection error: {e}")
+                print(f"  ⚠️ URL collection error: {e}\n")
                 continue
 
         # Remove duplicates (by URL)
@@ -1554,11 +1507,11 @@ class AdvancedAIAnalyzer:
                 unique_news.append(news)
                 seen_urls.add(news['url'])
 
-        print(f"\n✅ Collected {len(unique_news)} news items from today\n")
+        print(f"✅ {len(unique_news)} deduplicated news items from today\n")
 
         # Guarantee at least 3 items (add samples if missing)
         if len(unique_news) < 3:
-            print("⚠️ Not enough news; adding recent samples")
+            print("⚠️ Not enough news; adding recent samples\n")
            sample_news = [
                {
                    'title': 'MS "챗GPT 수요 폭증으로 데이터센터 부족...2026년까지 지속"',
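For context, here is a minimal standalone sketch of the extraction logic this commit switches to, runnable against a hypothetical text snippet shaped like the example in the diff's comments. The SAMPLE_TEXT value, the extract_today_articles name, and the 'MM-DD' format of today are illustrative assumptions, not part of app.py.

import re
from typing import Dict, List

# Hypothetical sample of the crawled text, in the [title](link) + byline
# shape that the diff's comments describe.
SAMPLE_TEXT = (
    '[MS "급증하는 \'챗GPT\' 수요로 데이터센터 부족...2026년까지 지속될 듯"]'
    '(https://www.aitimes.com/news/articleView.html?idxno=203055)\n'
    '산업일반박찬 기자10-10 15:10\n'
)

def extract_today_articles(text: str, today: str) -> List[Dict]:
    """Mirror of the committed logic: match [title](link) pairs, then look
    for an MM-DD HH:MM timestamp within 200 chars after each link."""
    pattern = r'\[([^\]]+)\]\((https://www\.aitimes\.com/news/articleView\.html\?idxno=\d+)\)'
    results = []
    for match in re.finditer(pattern, text):
        title = match.group(1).strip()
        link = match.group(2).strip()
        if len(title) < 10:  # skip overly short titles, as the commit does
            continue
        nearby_text = text[match.end():match.end() + 200]
        date_match = re.search(r'(\d{2}-\d{2}\s+\d{2}:\d{2})', nearby_text)
        date_text = date_match.group(1) if date_match else today
        if today not in date_text:  # plain substring filter
            continue
        results.append({'title': title, 'url': link, 'date': date_text,
                        'source': 'AI Times', 'category': 'AI'})
    return results

# today is assumed to be an 'MM-DD' string, e.g. datetime.now().strftime('%m-%d')
print(extract_today_articles(SAMPLE_TEXT, today='10-10'))

Note that the date filter is a plain substring check, so it only works while today is formatted the same way as the site's "10-10 15:10" bylines.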
 
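Similarly, a short sketch of the dedup-and-fallback step that follows the crawl, with a hypothetical dedupe_and_backfill helper. The exact backfill behavior is an assumption, since the diff only shows that hardcoded samples are added when fewer than 3 items survive and truncates the sample_news list.

from typing import Dict, List

def dedupe_and_backfill(all_news: List[Dict], sample_news: List[Dict]) -> List[Dict]:
    """Drop duplicate URLs (first occurrence wins), then top the list up to
    at least 3 items from the hardcoded samples, as the diff indicates."""
    seen_urls = set()
    unique_news = []
    for news in all_news:
        if news['url'] not in seen_urls:
            unique_news.append(news)
            seen_urls.add(news['url'])
    if len(unique_news) < 3:
        # Assumed backfill: append unseen samples until 3 items exist.
        for sample in sample_news:
            if len(unique_news) >= 3:
                break
            if sample.get('url', '') not in seen_urls:
                unique_news.append(sample)
                seen_urls.add(sample.get('url', ''))
    return unique_news

Keeping insertion order means freshly crawled items stay ahead of any samples in the returned list.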